From 06148f8e38a7cf4ba297afe207a4b6db676f1b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 13 Dec 2024 15:19:05 +0000 Subject: [PATCH 01/80] wip --- flow/connectors/core.go | 5 +- flow/connectors/mysql/cdc.go | 100 +++++++++++++++++++ flow/connectors/mysql/mysql.go | 40 ++++++-- flow/connectors/postgres/postgres.go | 2 +- flow/connectors/postgres/ssh_wrapped_pool.go | 5 +- flow/go.mod | 11 ++ flow/go.sum | 31 +++++- 7 files changed, 180 insertions(+), 14 deletions(-) create mode 100644 flow/connectors/mysql/cdc.go diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 0e1081dec5..ddf2a99d42 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -44,7 +44,7 @@ type ValidationConnector interface { type GetTableSchemaConnector interface { Connector - // GetTableSchema returns the schema of a table in terms of QValueKind. + // GetTableSchema returns the schema of a table in terms of type system. GetTableSchema( ctx context.Context, env map[string]string, @@ -404,7 +404,7 @@ func GetConnector(ctx context.Context, env map[string]string, config *protos.Pee case *protos.Peer_SqlserverConfig: return connsqlserver.NewSQLServerConnector(ctx, inner.SqlserverConfig) case *protos.Peer_MysqlConfig: - return connmysql.MySqlConnector{}, nil + return connmysql.NewMySqlConnector(ctx, inner.MysqlConfig) case *protos.Peer_ClickhouseConfig: return connclickhouse.NewClickHouseConnector(ctx, env, inner.ClickhouseConfig) case *protos.Peer_KafkaConfig: @@ -450,6 +450,7 @@ func CloseConnector(ctx context.Context, conn Connector) { // create type assertions to cause compile time error if connector interface not implemented var ( _ CDCPullConnector = &connpostgres.PostgresConnector{} + _ CDCPullConnector = &connmysql.MySqlConnector{} _ CDCPullPgConnector = &connpostgres.PostgresConnector{} diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go new file mode 100644 index 0000000000..3c581533e0 --- /dev/null +++ b/flow/connectors/mysql/cdc.go @@ -0,0 +1,100 @@ +package connmysql + +import ( + "context" + + "github.com/go-mysql-org/go-mysql/mysql" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/PeerDB-io/peerdb/flow/alerting" + "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" + "github.com/PeerDB-io/peerdb/flow/otel_metrics" +) + +func (c *MySqlConnector) GetTableSchema( + ctx context.Context, + env map[string]string, + system protos.TypeSystem, + tableIdentifiers []string, +) (map[string]*protos.TableSchema, error) { + panic("TODO") +} + +func (c *MySqlConnector) EnsurePullability(ctx context.Context, req *protos.EnsurePullabilityBatchInput) ( + *protos.EnsurePullabilityBatchOutput, error) + +func (c *MySqlConnector) ExportTxSnapshot(context.Context) (*protos.ExportTxSnapshotOutput, any, error) { + // https://dev.mysql.com/doc/refman/8.4/en/replication-howto-masterstatus.html + return nil, nil, nil +} + +func (c *MySqlConnector) FinishExport(any) error { + return nil +} + +func (c *MySqlConnector) SetupReplConn(context.Context) error { + // mysql code will spin up new connection for each normalize for now + return nil +} + +func (c *MySqlConnector) startCdcStreaming(lastOffsetName string, lastOffsetPos uint32) error { + // TODO prefer GTID + streamer, err := c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) + if err != nil { + return err + } + c.streamer = streamer + return nil +} + +func (c *MySqlConnector) ReplPing(context.Context) error { + return nil +} + +func (c 
*MySqlConnector) UpdateReplStateLastOffset(lastOffset int64) { + /* + if c.replState != nil { + c.replState.LastOffset.Store(lastOffset) + } + */ +} + +func (c *MySqlConnector) PullFlowCleanup(ctx context.Context, jobName string) error { + return nil +} + +func (c *MySqlConnector) HandleSlotInfo( + ctx context.Context, + alerter *alerting.Alerter, + catalogPool *pgxpool.Pool, + alertKeys *alerting.AlertKeys, + slotMetricGauges otel_metrics.SlotMetricGauges, +) error { + return nil +} + +func (c *MySqlConnector) GetSlotInfo(ctx context.Context, slotName string) ([]*protos.SlotInfo, error) { + return nil, nil +} + +func (c *MySqlConnector) AddTablesToPublication(ctx context.Context, req *protos.AddTablesToPublicationInput) error { + panic("TODO") +} + +func (c *MySqlConnector) RemoveTablesFromPublication(ctx context.Context, req *protos.RemoveTablesFromPublicationInput) error { + panic("TODO") +} + +func (c *MySqlConnector) PullRecords( + ctx context.Context, + catalogPool *pgxpool.Pool, + otelManager *otel_metrics.OtelManager, + req *model.PullRecordsRequest[model.RecordItems], +) error { + defer func() { + req.RecordStream.Close() + // update replState Offset + }() + return nil +} diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index d898826e1f..e9ba8807a7 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -1,15 +1,43 @@ -// stub to bypass validation +// https://airbyte.com/blog/replicating-mysql-a-look-at-the-binlog-and-gtids -package mysql +package connmysql -import "context" +import ( + "context" -type MySqlConnector struct{} + "github.com/go-mysql-org/go-mysql/replication" -func (MySqlConnector) Close() error { + "github.com/PeerDB-io/peerdb/flow/generated/protos" +) + +type MySqlConnector struct { + config *protos.MySqlConfig + syncer *replication.BinlogSyncer + streamer *replication.BinlogStreamer +} + +func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { + syncer := replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ + ServerID: 1729, // TODO put in config + Flavor: "mysql", // TODO put in config + Host: config.Host, + Port: uint16(config.Port), + User: config.User, + Password: config.Password, + }) + return &MySqlConnector{ + config: config, + syncer: syncer, + }, nil +} + +func (c *MySqlConnector) Close() error { + if c.syncer != nil { + c.syncer.Close() + } return nil } -func (MySqlConnector) ConnectionActive(context.Context) error { +func (*MySqlConnector) ConnectionActive(context.Context) error { return nil } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 1df5970f2b..3b7a302773 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -125,7 +125,7 @@ func (c *PostgresConnector) fetchCustomTypeMapping(ctx context.Context) (map[uin } func (c *PostgresConnector) CreateReplConn(ctx context.Context) (*pgx.Conn, error) { - // create a separate connection pool for non-replication queries as replication connections cannot + // create a separate connection for non-replication queries as replication connections cannot // be used for extended query protocol, i.e. 
prepared statements replConfig, err := pgx.ParseConfig(c.connStr) if err != nil { diff --git a/flow/connectors/postgres/ssh_wrapped_pool.go b/flow/connectors/postgres/ssh_wrapped_pool.go index 09f394f83a..6a2a2bdc36 100644 --- a/flow/connectors/postgres/ssh_wrapped_pool.go +++ b/flow/connectors/postgres/ssh_wrapped_pool.go @@ -125,15 +125,14 @@ func (tunnel *SSHTunnel) NewPostgresConnFromConfig( } host := connConfig.Host - err = retryWithBackoff(logger, func() error { + if err := retryWithBackoff(logger, func() error { _, err := conn.Exec(ctx, "SELECT 1") if err != nil { logger.Error("Failed to ping pool", slog.Any("error", err), slog.String("host", host)) return err } return nil - }, 5, 5*time.Second) - if err != nil { + }, 5, 5*time.Second); err != nil { logger.Error("Failed to create pool", slog.Any("error", err), slog.String("host", host)) conn.Close(ctx) return nil, err diff --git a/flow/go.mod b/flow/go.mod index 7d43194e45..4fa6b0c833 100644 --- a/flow/go.mod +++ b/flow/go.mod @@ -28,6 +28,7 @@ require ( github.com/aws/smithy-go v1.22.1 github.com/cockroachdb/pebble v1.1.3 github.com/elastic/go-elasticsearch/v8 v8.17.0 + github.com/go-mysql-org/go-mysql v1.10.0 github.com/google/uuid v1.6.0 github.com/grafana/pyroscope-go v1.2.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 @@ -87,6 +88,7 @@ require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0 // indirect github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect + github.com/Masterminds/semver v1.5.0 // indirect github.com/apache/arrow/go/v15 v15.0.2 // indirect github.com/apache/arrow/go/v16 v16.1.0 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.23 // indirect @@ -144,6 +146,9 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nexus-rpc/sdk-go v0.1.0 // indirect github.com/paulmach/orb v0.11.1 // indirect + github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb // indirect + github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 // indirect + github.com/pingcap/tidb/pkg/parser v0.0.0-20250107130058-67249cb2fe50 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/prometheus/client_golang v1.20.5 // indirect @@ -152,6 +157,8 @@ require ( github.com/prometheus/procfs v0.15.1 // indirect github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/segmentio/asm v1.2.0 // indirect + github.com/siddontang/go v0.0.0-20180604090527-bdc77568d726 // indirect + github.com/siddontang/go-log v0.0.0-20190221022429-1e957dd83bed // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/twmb/franz-go/pkg/kmsg v1.9.0 // indirect github.com/x448/float16 v0.8.4 // indirect @@ -161,10 +168,14 @@ require ( go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect go.opentelemetry.io/otel/trace v1.33.0 // indirect go.opentelemetry.io/proto/otlp v1.5.0 // indirect + go.uber.org/atomic v1.11.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect golang.org/x/mod v0.22.0 // indirect golang.org/x/term v0.28.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect k8s.io/api v0.32.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi 
v0.0.0-20241212222426-2c72e554b1e7 // indirect diff --git a/flow/go.sum b/flow/go.sum index d965bfcdab..c82ea07e47 100644 --- a/flow/go.sum +++ b/flow/go.sum @@ -84,6 +84,8 @@ github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapp github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0/go.mod h1:wRbFgBQUVm1YXrvWKofAEmq9HNJTDphbAaJSSX01KUI= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/PeerDB-io/glua64 v1.0.1 h1:biXLlFF/L5pnJCwDon7hkWkuQPozC8NjKS3J7Wzi69I= github.com/PeerDB-io/glua64 v1.0.1/go.mod h1:UHmAhniv61bJPMhQvxkpC7jXbn353dSbQviu83bgQVg= github.com/PeerDB-io/gluabit32 v1.0.2 h1:AGI1Z7dwDVotakpuOOuyTX4/QGi5HUYsipL/VfodmO4= @@ -237,6 +239,8 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-mysql-org/go-mysql v1.10.0 h1:9iEPrZdHKq6EepUuPONrBA+wc3aL1WLhbUm5w8ryDFg= +github.com/go-mysql-org/go-mysql v1.10.0/go.mod h1:GzFQAI+FqbYAPtsannL0hmZH6zcLzCQbwqopT9bgTt0= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= @@ -427,8 +431,13 @@ github.com/pborman/uuid v1.2.1 h1:+ZZIw58t/ozdjRaXh/3awHfmWRbzYxJoAdNJxe/3pvw= github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= -github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= -github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pingcap/errors v0.11.0/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb h1:3pSi4EDG6hg0orE1ndHkXvX6Qdq2cZn8gAPir8ymKZk= +github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg= +github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 h1:2SOzvGvE8beiC1Y4g9Onkvu6UmuBBOeWRGQEjJaT/JY= +github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= +github.com/pingcap/tidb/pkg/parser v0.0.0-20250107130058-67249cb2fe50 h1:AGpzBz+O1vpdySRD5MWb9HsN9vMs1gUw/NyfY2AzQHQ= +github.com/pingcap/tidb/pkg/parser v0.0.0-20250107130058-67249cb2fe50/go.mod h1:Hju1TEWZvrctQKbztTRwXH7rd41Yq0Pgmq4PrEKcq7o= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= @@ 
-462,6 +471,10 @@ github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys= github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= +github.com/siddontang/go v0.0.0-20180604090527-bdc77568d726 h1:xT+JlYxNGqyT+XcU8iUrN18JYed2TvG9yN5ULG2jATM= +github.com/siddontang/go v0.0.0-20180604090527-bdc77568d726/go.mod h1:3yhqj7WBBfRhbBlzyOC3gUxftwsU0u8gqevxwIHQpMw= +github.com/siddontang/go-log v0.0.0-20190221022429-1e957dd83bed h1:KMgQoLJGCq1IoZpLZE3AIffh9veYWoVlsvA4ib55TMM= +github.com/siddontang/go-log v0.0.0-20190221022429-1e957dd83bed/go.mod h1:yFdBgwXP24JziuRl2NMUahT7nGLNOKi1SIiFxMttVD4= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= @@ -557,14 +570,24 @@ go.temporal.io/sdk v1.31.0 h1:CLYiP0R5Sdj0gq8LyYKDDz4ccGOdJPR8wNGJU0JGwj8= go.temporal.io/sdk v1.31.0/go.mod h1:8U8H7rF9u4Hyb4Ry9yiEls5716DHPNvVITPNkgWUwE8= go.temporal.io/sdk/contrib/opentelemetry v0.6.0 h1:rNBArDj5iTUkcMwKocUShoAW59o6HdS7Nq4CTp4ldj8= go.temporal.io/sdk/contrib/opentelemetry v0.6.0/go.mod h1:Lem8VrE2ks8P+FYcRM3UphPoBr+tfM3v/Kaf0qStzSg= +go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= +go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= +go.uber.org/multierr v1.7.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -638,6 +661,7 @@ golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod 
h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -697,6 +721,9 @@ gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSP gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= +gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= From e0e005b97aada3b3dc20b6c68ad516144d8be061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 13 Dec 2024 22:30:48 +0000 Subject: [PATCH 02/80] wip2 --- flow/connectors/external_metadata/store.go | 54 +++++------ flow/connectors/mysql/cdc.go | 14 ++- flow/connectors/mysql/mysql.go | 94 ++++++++++++++++++- .../migrations/V42__mysql_metadata.sql | 10 ++ protos/peers.proto | 1 + 5 files changed, 139 insertions(+), 34 deletions(-) create mode 100644 nexus/catalog/migrations/V42__mysql_metadata.sql diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index eabe2509f7..6e72f963e3 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -49,9 +49,8 @@ func NewPostgresMetadataFromCatalog(logger log.Logger, pool *pgxpool.Pool) *Post } func (p *PostgresMetadata) Ping(ctx context.Context) error { - pingErr := p.pool.Ping(ctx) - if pingErr != nil { - return fmt.Errorf("metadata db ping failed: %w", pingErr) + if err := p.pool.Ping(ctx); err != nil { + return fmt.Errorf("metadata db ping failed: %w", err) } return nil @@ -73,13 +72,11 @@ func (p *PostgresMetadata) SetupMetadataTables(_ context.Context) error { } func (p *PostgresMetadata) GetLastOffset(ctx context.Context, jobName string) (int64, error) { - row := p.pool.QueryRow(ctx, - `SELECT last_offset FROM `+ - lastSyncStateTableName+ - ` WHERE job_name = $1`, jobName) var offset pgtype.Int8 - err := row.Scan(&offset) - if err != nil { + if err := p.pool.QueryRow(ctx, + `SELECT last_offset FROM `+lastSyncStateTableName+` WHERE job_name = $1`, + jobName, + ).Scan(&offset); err != nil { if err == pgx.ErrNoRows { return 0, nil } @@ -94,12 +91,11 @@ func (p *PostgresMetadata) GetLastOffset(ctx context.Context, jobName string) (i } func (p *PostgresMetadata) GetLastSyncBatchID(ctx context.Context, jobName string) (int64, error) { - row := p.pool.QueryRow(ctx, - `SELECT sync_batch_id FROM `+lastSyncStateTableName+` WHERE job_name = $1`, jobName) - var syncBatchID pgtype.Int8 - err := row.Scan(&syncBatchID) - if 
err != nil { + if err := p.pool.QueryRow(ctx, + `SELECT sync_batch_id FROM `+lastSyncStateTableName+` WHERE job_name = $1`, + jobName, + ).Scan(&syncBatchID); err != nil { // if the job doesn't exist, return 0 if err == pgx.ErrNoRows { return 0, nil @@ -114,14 +110,11 @@ func (p *PostgresMetadata) GetLastSyncBatchID(ctx context.Context, jobName strin } func (p *PostgresMetadata) GetLastNormalizeBatchID(ctx context.Context, jobName string) (int64, error) { - rows := p.pool.QueryRow(ctx, - `SELECT normalize_batch_id FROM `+ - lastSyncStateTableName+ - ` WHERE job_name = $1`, jobName) - var normalizeBatchID pgtype.Int8 - err := rows.Scan(&normalizeBatchID) - if err != nil { + if err := p.pool.QueryRow(ctx, + `SELECT normalize_batch_id FROM `+lastSyncStateTableName+` WHERE job_name = $1`, + jobName, + ).Scan(&normalizeBatchID); err != nil { // if the job doesn't exist, return 0 if err.Error() == "no rows in result set" { return 0, nil @@ -137,14 +130,13 @@ func (p *PostgresMetadata) GetLastNormalizeBatchID(ctx context.Context, jobName func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, offset int64) error { p.logger.Debug("updating last offset", slog.String("offset", pglogrepl.LSN(offset).String())) - _, err := p.pool.Exec(ctx, ` + if _, err := p.pool.Exec(ctx, ` INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, sync_batch_id) VALUES ($1, $2, $3) ON CONFLICT (job_name) DO UPDATE SET last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), updated_at = NOW() - `, jobName, offset, 0) - if err != nil { + `, jobName, offset, 0); err != nil { p.logger.Error("failed to update last offset", "error", err) return err } @@ -154,7 +146,7 @@ func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, of func (p *PostgresMetadata) FinishBatch(ctx context.Context, jobName string, syncBatchID int64, offset int64) error { p.logger.Info("finishing batch", "SyncBatchID", syncBatchID, "offset", offset) - _, err := p.pool.Exec(ctx, ` + if _, err := p.pool.Exec(ctx, ` INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, sync_batch_id) VALUES ($1, $2, $3) ON CONFLICT (job_name) @@ -162,8 +154,7 @@ func (p *PostgresMetadata) FinishBatch(ctx context.Context, jobName string, sync last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), sync_batch_id = GREATEST(`+lastSyncStateTableName+`.sync_batch_id, excluded.sync_batch_id), updated_at = NOW() - `, jobName, offset, syncBatchID) - if err != nil { + `, jobName, offset, syncBatchID); err != nil { p.logger.Error("failed to finish batch", slog.Any("error", err)) return err } @@ -221,14 +212,13 @@ func (p *PostgresMetadata) IsQRepPartitionSynced(ctx context.Context, req *proto } func (p *PostgresMetadata) SyncFlowCleanup(ctx context.Context, jobName string) error { - _, err := p.pool.Exec(ctx, - `DELETE FROM `+lastSyncStateTableName+` WHERE job_name = $1`, jobName) - if err != nil { + if _, err := p.pool.Exec(ctx, + `DELETE FROM `+lastSyncStateTableName+` WHERE job_name = $1`, jobName, + ); err != nil { return err } - _, err = p.pool.Exec(ctx, `DELETE FROM `+qrepTableName+` WHERE job_name = $1`, jobName) - if err != nil { + if _, err := p.pool.Exec(ctx, `DELETE FROM `+qrepTableName+` WHERE job_name = $1`, jobName); err != nil { return err } diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 3c581533e0..b8e0bf467a 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -38,7 +38,7 @@ func (c 
*MySqlConnector) SetupReplConn(context.Context) error { return nil } -func (c *MySqlConnector) startCdcStreaming(lastOffsetName string, lastOffsetPos uint32) error { +func (c *MySqlConnector) startCdcStreamingFilePos(lastOffsetName string, lastOffsetPos uint32) error { // TODO prefer GTID streamer, err := c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) if err != nil { @@ -48,6 +48,17 @@ func (c *MySqlConnector) startCdcStreaming(lastOffsetName string, lastOffsetPos return nil } +func (c *MySqlConnector) startCdcStreamingGtid(lastOffsetName string, lastOffsetPos uint32) error { + // https: //hevodata.com/learn/mysql-gtids-and-replication-set-up + // TODO prefer GTID + streamer, err := c.syncer.StartSyncGTID(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) + if err != nil { + return err + } + c.streamer = streamer + return nil +} + func (c *MySqlConnector) ReplPing(context.Context) error { return nil } @@ -96,5 +107,6 @@ func (c *MySqlConnector) PullRecords( req.RecordStream.Close() // update replState Offset }() + c.startCdcStreaming(req.LastOffset) return nil } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index e9ba8807a7..71a2418202 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -4,14 +4,22 @@ package connmysql import ( "context" + "crypto/tls" + "fmt" + "time" + "github.com/go-mysql-org/go-mysql/client" + "github.com/go-mysql-org/go-mysql/mysql" "github.com/go-mysql-org/go-mysql/replication" + metadataStore "github.com/PeerDB-io/peerdb/flow/connectors/external_metadata" "github.com/PeerDB-io/peerdb/flow/generated/protos" ) type MySqlConnector struct { + *metadataStore.PostgresMetadata config *protos.MySqlConfig + conn *client.Conn syncer *replication.BinlogSyncer streamer *replication.BinlogStreamer } @@ -35,9 +43,93 @@ func (c *MySqlConnector) Close() error { if c.syncer != nil { c.syncer.Close() } + if c.conn != nil { + return c.conn.Close() + } return nil } -func (*MySqlConnector) ConnectionActive(context.Context) error { +func (c *MySqlConnector) ConnectionActive(context.Context) error { + if c.conn != nil { + return c.conn.Ping() + } return nil } + +func (c *MySqlConnector) connect(ctx context.Context, options ...client.Option) (*client.Conn, error) { + return client.ConnectWithContext(ctx, fmt.Sprintf("%s:%d", c.config.Host, c.config.Port), + c.config.User, c.config.Password, c.config.Database, time.Minute, options...) +} + +func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interface{}) (*mysql.Result, error) { + reconnects := 3 + for { + if c.conn == nil { + var err error + var argF []client.Option + if !c.config.DisableTls { + argF = append(argF, func(conn *client.Conn) error { + conn.SetTLSConfig(&tls.Config{MinVersion: tls.VersionTLS13}) + return nil + }) + } + c.conn, err = c.connect(ctx, argF...) + if err != nil { + return nil, fmt.Errorf("failed to connect to mysql server: %w", err) + } + } + + rr, err := c.conn.Execute(cmd, args...) 
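+		// if the cached connection has gone bad (mysql.ErrBadConn), drop it and retry the
+		// statement on a fresh connection, at most 3 reconnect attempts; other errors return as-is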
+ if err != nil { + if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { + reconnects -= 1 + c.conn.Close() + c.conn = nil + continue + } + return nil, err + } + return rr, nil + } +} + +func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, error) { + showBinlogStatus := "SHOW BINARY LOG STATUS" + if eq, err := c.conn.CompareServerVersion("8.4.0"); (err == nil) && (eq < 0) { + showBinlogStatus = "SHOW MASTER STATUS" + } + + rr, err := c.Execute(ctx, showBinlogStatus) + if err != nil { + return mysql.Position{}, fmt.Errorf("failed to SHOW BINARY LOG STATUS: %w", err) + } + + // TODO: check error? + name, _ := rr.GetString(0, 0) + pos, _ := rr.GetInt(0, 1) + + return mysql.Position{Name: name, Pos: uint32(pos)}, nil +} + +func (c *MySqlConnector) GetMasterGTIDSet(ctx context.Context) (mysql.GTIDSet, error) { + query := "" + switch c.config.Flavor { + case mysql.MariaDBFlavor: + query = "SELECT @@GLOBAL.gtid_current_pos" + default: + query = "SELECT @@GLOBAL.GTID_EXECUTED" + } + rr, err := c.Execute(ctx, query) + if err != nil { + return nil, fmt.Errorf("failed to SELECT @@GLOBAL.GTID_EXECUTED", err) + } + gx, err := rr.GetString(0, 0) + if err != nil { + return nil, fmt.Errorf("failed to GetString for GTID_EXECUTED", err) + } + gset, err := mysql.ParseGTIDSet(c.config.Flavor, gx) + if err != nil { + return nil, fmt.Errorf("failed to parse GTID from GTID_EXECUTED: %w", err) + } + return gset, nil +} diff --git a/nexus/catalog/migrations/V42__mysql_metadata.sql b/nexus/catalog/migrations/V42__mysql_metadata.sql new file mode 100644 index 0000000000..fe128403ec --- /dev/null +++ b/nexus/catalog/migrations/V42__mysql_metadata.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS metadata_mysql_sync_state ( + job_name TEXT PRIMARY KEY NOT NULL, + pos_file text, + pos_offset int, + gtid text, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + sync_batch_id BIGINT NOT NULL, + normalize_batch_id BIGINT +); + diff --git a/protos/peers.proto b/protos/peers.proto index a33cfd5a1d..73b8c7c7a5 100644 --- a/protos/peers.proto +++ b/protos/peers.proto @@ -142,6 +142,7 @@ message MySqlConfig { repeated string setup = 6; uint32 compression = 7; bool disable_tls = 8; + string flavor = 9; } message KafkaConfig { From 27e9a12f89d7f00caaa876a8ff4e8bb96d3de5b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sun, 15 Dec 2024 04:20:45 +0000 Subject: [PATCH 03/80] some cdc loop sketch, fat checkpoint type --- flow/connectors/bigquery/qrep_avro_sync.go | 10 +- flow/connectors/clickhouse/cdc.go | 12 +-- flow/connectors/core.go | 6 +- .../connectors/elasticsearch/elasticsearch.go | 12 +-- flow/connectors/eventhub/eventhub.go | 12 +-- flow/connectors/external_metadata/store.go | 40 ++++---- flow/connectors/kafka/kafka.go | 12 +-- flow/connectors/mysql/cdc.go | 97 ++++++++++++++----- flow/connectors/mysql/mysql.go | 20 ++-- flow/connectors/postgres/cdc.go | 8 +- flow/connectors/postgres/client.go | 14 +-- flow/connectors/postgres/postgres.go | 43 ++++---- flow/connectors/pubsub/pubsub.go | 12 +-- flow/connectors/s3/s3.go | 8 +- flow/connectors/snowflake/snowflake.go | 12 +-- .../connectors/utils/monitoring/monitoring.go | 6 +- flow/model/cdc_stream.go | 47 +++++---- flow/model/model.go | 6 +- flow/pua/stream_adapter.go | 4 +- .../migrations/V42__mysql_metadata.sql | 10 +- 20 files changed, 221 insertions(+), 170 deletions(-) diff --git a/flow/connectors/bigquery/qrep_avro_sync.go b/flow/connectors/bigquery/qrep_avro_sync.go index 95d5ffb9e5..cb4224f13a 100644 --- 
a/flow/connectors/bigquery/qrep_avro_sync.go +++ b/flow/connectors/bigquery/qrep_avro_sync.go @@ -103,11 +103,11 @@ func (s *QRepAvroSyncMethod) SyncRecords( } return &model.SyncResponse{ - LastSyncedCheckpointID: lastCP, - NumRecordsSynced: int64(numRecords), - CurrentSyncBatchID: syncBatchID, - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + LastSyncedCheckpoint: lastCP, + NumRecordsSynced: int64(numRecords), + CurrentSyncBatchID: syncBatchID, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 6442f7d1dc..c1205a34c5 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -97,11 +97,11 @@ func (c *ClickHouseConnector) syncRecordsViaAvro( } return &model.SyncResponse{ - LastSyncedCheckpointID: req.Records.GetLastCheckpoint(), - NumRecordsSynced: int64(numRecords), - CurrentSyncBatchID: syncBatchID, - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + LastSyncedCheckpoint: req.Records.GetLastCheckpoint(), + NumRecordsSynced: int64(numRecords), + CurrentSyncBatchID: syncBatchID, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } @@ -111,7 +111,7 @@ func (c *ClickHouseConnector) SyncRecords(ctx context.Context, req *model.SyncRe return nil, err } - if err := c.FinishBatch(ctx, req.FlowJobName, req.SyncBatchID, res.LastSyncedCheckpointID); err != nil { + if err := c.FinishBatch(ctx, req.FlowJobName, req.SyncBatchID, res.LastSyncedCheckpoint); err != nil { c.logger.Error("failed to increment id", slog.Any("error", err)) return nil, err } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index ddf2a99d42..294eeeb62a 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -74,7 +74,7 @@ type CDCPullConnectorCore interface { ReplPing(context.Context) error // Called when offset has been confirmed to destination - UpdateReplStateLastOffset(lastOffset int64) + UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) // PullFlowCleanup drops both the Postgres publication and replication slot, as a part of DROP MIRROR PullFlowCleanup(ctx context.Context, jobName string) error @@ -156,10 +156,10 @@ type CDCSyncConnectorCore interface { SetupMetadataTables(ctx context.Context) error // GetLastOffset gets the last offset from the metadata table on the destination - GetLastOffset(ctx context.Context, jobName string) (int64, error) + GetLastOffset(ctx context.Context, jobName string) (model.CdcCheckpoint, error) // SetLastOffset updates the last offset on the metadata table on the destination - SetLastOffset(ctx context.Context, jobName string, lastOffset int64) error + SetLastOffset(ctx context.Context, jobName string, lastOffset model.CdcCheckpoint) error // GetLastSyncBatchID gets the last batch synced to the destination from the metadata table GetLastSyncBatchID(ctx context.Context, jobName string) (int64, error) diff --git a/flow/connectors/elasticsearch/elasticsearch.go b/flow/connectors/elasticsearch/elasticsearch.go index c87bb53986..fd77754534 100644 --- a/flow/connectors/elasticsearch/elasticsearch.go +++ b/flow/connectors/elasticsearch/elasticsearch.go @@ -163,7 +163,7 @@ func (esc *ElasticsearchConnector) SyncRecords(ctx context.Context, case <-ticker.C: lastSeen := lastSeenLSN.Load() if lastSeen > req.ConsumedOffset.Load() { - if err := esc.SetLastOffset(ctx, 
req.FlowJobName, lastSeen); err != nil { + if err := esc.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: lastSeen}); err != nil { esc.logger.Warn("[es] SetLastOffset error", slog.Any("error", err)) } else { shared.AtomicInt64Max(req.ConsumedOffset, lastSeen) @@ -297,10 +297,10 @@ func (esc *ElasticsearchConnector) SyncRecords(ctx context.Context, } return &model.SyncResponse{ - CurrentSyncBatchID: req.SyncBatchID, - LastSyncedCheckpointID: lastCheckpoint, - NumRecordsSynced: numRecords, - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + CurrentSyncBatchID: req.SyncBatchID, + LastSyncedCheckpoint: lastCheckpoint, + NumRecordsSynced: numRecords, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/eventhub/eventhub.go b/flow/connectors/eventhub/eventhub.go index 153f57c5d9..788df4f754 100644 --- a/flow/connectors/eventhub/eventhub.go +++ b/flow/connectors/eventhub/eventhub.go @@ -319,7 +319,7 @@ func (c *EventHubConnector) processBatch( if err != nil { return 0, err } else if lastSeenLSN > req.ConsumedOffset.Load() { - if err := c.SetLastOffset(ctx, req.FlowJobName, lastSeenLSN); err != nil { + if err := c.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: lastSeenLSN}); err != nil { c.logger.Warn("[eventhubs] SetLastOffset error", slog.Any("error", err)) } else { shared.AtomicInt64Max(req.ConsumedOffset, lastSeenLSN) @@ -345,11 +345,11 @@ func (c *EventHubConnector) SyncRecords(ctx context.Context, req *model.SyncReco } return &model.SyncResponse{ - CurrentSyncBatchID: req.SyncBatchID, - LastSyncedCheckpointID: lastCheckpoint, - NumRecordsSynced: int64(numRecords), - TableNameRowsMapping: make(map[string]*model.RecordTypeCounts), - TableSchemaDeltas: req.Records.SchemaDeltas, + CurrentSyncBatchID: req.SyncBatchID, + LastSyncedCheckpoint: lastCheckpoint, + NumRecordsSynced: int64(numRecords), + TableNameRowsMapping: make(map[string]*model.RecordTypeCounts), + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 6e72f963e3..7df5267588 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -15,6 +15,7 @@ import ( "github.com/PeerDB-io/peerdb/flow/connectors/utils/monitoring" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" "github.com/PeerDB-io/peerdb/flow/peerdbenv" "github.com/PeerDB-io/peerdb/flow/shared" ) @@ -71,23 +72,23 @@ func (p *PostgresMetadata) SetupMetadataTables(_ context.Context) error { return nil } -func (p *PostgresMetadata) GetLastOffset(ctx context.Context, jobName string) (int64, error) { - var offset pgtype.Int8 +func (p *PostgresMetadata) GetLastOffset(ctx context.Context, jobName string) (model.CdcCheckpoint, error) { + var offset model.CdcCheckpoint if err := p.pool.QueryRow(ctx, - `SELECT last_offset FROM `+lastSyncStateTableName+` WHERE job_name = $1`, + `SELECT last_offset, last_text FROM `+lastSyncStateTableName+` WHERE job_name = $1`, jobName, - ).Scan(&offset); err != nil { + ).Scan(&offset.ID, &offset.Text); err != nil { if err == pgx.ErrNoRows { - return 0, nil + return offset, nil } p.logger.Error("failed to get last offset", "error", err) - return 0, err + return offset, err } - p.logger.Info("got last offset for job", "offset", offset.Int64) + p.logger.Info("got last offset for job", "offset", offset) - return 
offset.Int64, nil + return offset, nil } func (p *PostgresMetadata) GetLastSyncBatchID(ctx context.Context, jobName string) (int64, error) { @@ -128,15 +129,17 @@ func (p *PostgresMetadata) GetLastNormalizeBatchID(ctx context.Context, jobName return normalizeBatchID.Int64, nil } -func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, offset int64) error { - p.logger.Debug("updating last offset", slog.String("offset", pglogrepl.LSN(offset).String())) +func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, offset model.CdcCheckpoint) error { + p.logger.Debug("updating last offset", slog.String("offsetID", pglogrepl.LSN(offset.ID).String()), slog.String("offsetText", offset.Text)) if _, err := p.pool.Exec(ctx, ` - INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, sync_batch_id) - VALUES ($1, $2, $3) + INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, last_text, sync_batch_id) + VALUES ($1, $2, $3, $4) ON CONFLICT (job_name) - DO UPDATE SET last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), + DO UPDATE SET + last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), + last_text = excluded.last_text, updated_at = NOW() - `, jobName, offset, 0); err != nil { + `, jobName, offset.ID, offset.Text, 0); err != nil { p.logger.Error("failed to update last offset", "error", err) return err } @@ -144,17 +147,18 @@ func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, of return nil } -func (p *PostgresMetadata) FinishBatch(ctx context.Context, jobName string, syncBatchID int64, offset int64) error { +func (p *PostgresMetadata) FinishBatch(ctx context.Context, jobName string, syncBatchID int64, offset model.CdcCheckpoint) error { p.logger.Info("finishing batch", "SyncBatchID", syncBatchID, "offset", offset) if _, err := p.pool.Exec(ctx, ` - INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, sync_batch_id) - VALUES ($1, $2, $3) + INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, last_text, sync_batch_id) + VALUES ($1, $2, $3, $4) ON CONFLICT (job_name) DO UPDATE SET last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), + last_text = excluded.last_text, sync_batch_id = GREATEST(`+lastSyncStateTableName+`.sync_batch_id, excluded.sync_batch_id), updated_at = NOW() - `, jobName, offset, syncBatchID); err != nil { + `, jobName, offset.ID, offset.Text, syncBatchID); err != nil { p.logger.Error("failed to finish batch", slog.Any("error", err)) return err } diff --git a/flow/connectors/kafka/kafka.go b/flow/connectors/kafka/kafka.go index 2944a0e8c7..8f94b9d1db 100644 --- a/flow/connectors/kafka/kafka.go +++ b/flow/connectors/kafka/kafka.go @@ -313,7 +313,7 @@ func (c *KafkaConnector) SyncRecords(ctx context.Context, req *model.SyncRecords c.logger.Warn("[kafka] flush error", slog.Any("error", err)) continue } else if lastSeen > req.ConsumedOffset.Load() { - if err := c.SetLastOffset(ctx, req.FlowJobName, lastSeen); err != nil { + if err := c.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: lastSeen}); err != nil { c.logger.Warn("[kafka] SetLastOffset error", slog.Any("error", err)) } else { shared.AtomicInt64Max(req.ConsumedOffset, lastSeen) @@ -392,10 +392,10 @@ Loop: } return &model.SyncResponse{ - CurrentSyncBatchID: req.SyncBatchID, - LastSyncedCheckpointID: lastCheckpoint, - NumRecordsSynced: numRecords.Load(), - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: 
req.Records.SchemaDeltas, + CurrentSyncBatchID: req.SyncBatchID, + LastSyncedCheckpoint: lastCheckpoint, + NumRecordsSynced: numRecords.Load(), + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index b8e0bf467a..b2c17d7f17 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -4,11 +4,13 @@ import ( "context" "github.com/go-mysql-org/go-mysql/mysql" + "github.com/go-mysql-org/go-mysql/replication" "github.com/jackc/pgx/v5/pgxpool" "github.com/PeerDB-io/peerdb/flow/alerting" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/model" + "github.com/PeerDB-io/peerdb/flow/model/qvalue" "github.com/PeerDB-io/peerdb/flow/otel_metrics" ) @@ -38,37 +40,20 @@ func (c *MySqlConnector) SetupReplConn(context.Context) error { return nil } -func (c *MySqlConnector) startCdcStreamingFilePos(lastOffsetName string, lastOffsetPos uint32) error { - // TODO prefer GTID - streamer, err := c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) - if err != nil { - return err - } - c.streamer = streamer - return nil +func (c *MySqlConnector) startCdcStreamingFilePos(lastOffsetName string, lastOffsetPos uint32) (*replication.BinlogStreamer, error) { + return c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) } -func (c *MySqlConnector) startCdcStreamingGtid(lastOffsetName string, lastOffsetPos uint32) error { - // https: //hevodata.com/learn/mysql-gtids-and-replication-set-up - // TODO prefer GTID - streamer, err := c.syncer.StartSyncGTID(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) - if err != nil { - return err - } - c.streamer = streamer - return nil +func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication.BinlogStreamer, error) { + // https://hevodata.com/learn/mysql-gtids-and-replication-set-up + return c.syncer.StartSyncGTID(gset) } func (c *MySqlConnector) ReplPing(context.Context) error { return nil } -func (c *MySqlConnector) UpdateReplStateLastOffset(lastOffset int64) { - /* - if c.replState != nil { - c.replState.LastOffset.Store(lastOffset) - } - */ +func (c *MySqlConnector) UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) { } func (c *MySqlConnector) PullFlowCleanup(ctx context.Context, jobName string) error { @@ -90,11 +75,16 @@ func (c *MySqlConnector) GetSlotInfo(ctx context.Context, slotName string) ([]*p } func (c *MySqlConnector) AddTablesToPublication(ctx context.Context, req *protos.AddTablesToPublicationInput) error { - panic("TODO") + return nil } func (c *MySqlConnector) RemoveTablesFromPublication(ctx context.Context, req *protos.RemoveTablesFromPublicationInput) error { - panic("TODO") + return nil +} + +func qvalueFromMysql(typ byte, val any) qvalue.QValue { + // TODO + return nil } func (c *MySqlConnector) PullRecords( @@ -107,6 +97,61 @@ func (c *MySqlConnector) PullRecords( req.RecordStream.Close() // update replState Offset }() - c.startCdcStreaming(req.LastOffset) + gset, err := mysql.ParseGTIDSet(c.config.Flavor, req.LastOffset.Text) + if err != nil { + return err + } + mystream, err := c.startCdcStreamingGtid(gset) + if err != nil { + return err + } + for { + event, err := mystream.GetEvent(ctx) + if err != nil { + return err + } + switch ev := event.Event.(type) { + case *replication.RowsEvent: + for _, row := range ev.Rows { + var record model.Record[model.RecordItems] + //TODO need 
tableNameMapping[source] -> destination + //TODO need mapping of column index to column name + var items model.RecordItems + switch event.Header.EventType { + case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: + for idx, val := range row { + // TODO + items.AddColumn("ColumnName", qvalueFromMysql(ev.Table.ColumnType[idx], val)) + } + record = &model.InsertRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: string(ev.Table.Table), + } + case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: + // TODO no OldItems / NewItems. How does primary key update work? + record = &model.UpdateRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + NewItems: items, + SourceTableName: string(ev.Table.Table), + } + case replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: + record = &model.DeleteRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: string(ev.Table.Table), + } + default: + continue + } + err := req.RecordStream.AddRecord(ctx, record) + if err != nil { + return err + } + } + break + } + break // TODO when batch ready + } return nil } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 71a2418202..3cb21b1bcb 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -18,20 +18,20 @@ import ( type MySqlConnector struct { *metadataStore.PostgresMetadata - config *protos.MySqlConfig - conn *client.Conn - syncer *replication.BinlogSyncer - streamer *replication.BinlogStreamer + config *protos.MySqlConfig + conn *client.Conn + syncer *replication.BinlogSyncer } func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { syncer := replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ - ServerID: 1729, // TODO put in config - Flavor: "mysql", // TODO put in config - Host: config.Host, - Port: uint16(config.Port), - User: config.User, - Password: config.Password, + ServerID: 1729, // TODO put in config + Flavor: config.Flavor, + Host: config.Host, + Port: uint16(config.Port), + User: config.User, + Password: config.Password, + UseDecimal: true, }) return &MySqlConnector{ config: config, diff --git a/flow/connectors/postgres/cdc.go b/flow/connectors/postgres/cdc.go index 2547e901c6..8a5cdd9160 100644 --- a/flow/connectors/postgres/cdc.go +++ b/flow/connectors/postgres/cdc.go @@ -394,7 +394,7 @@ func PullCdcRecords[Items model.Items]( if pkmRequiresResponse { if cdcRecordsStorage.IsEmpty() && int64(clientXLogPos) > req.ConsumedOffset.Load() { metadata := connmetadata.NewPostgresMetadataFromCatalog(logger, p.catalogPool) - if err := metadata.SetLastOffset(ctx, req.FlowJobName, int64(clientXLogPos)); err != nil { + if err := metadata.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}); err != nil { return err } req.ConsumedOffset.Store(int64(clientXLogPos)) @@ -633,7 +633,7 @@ func PullCdcRecords[Items model.Items]( if cdcRecordsStorage.IsEmpty() { if int64(clientXLogPos) > req.ConsumedOffset.Load() { metadata := connmetadata.NewPostgresMetadataFromCatalog(logger, p.catalogPool) - if err := metadata.SetLastOffset(ctx, req.FlowJobName, int64(clientXLogPos)); err != nil { + if err := 
metadata.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}); err != nil { return err } req.ConsumedOffset.Store(int64(clientXLogPos)) @@ -689,7 +689,7 @@ func processMessage[Items model.Items]( case *pglogrepl.CommitMessage: // for a commit message, update the last checkpoint id for the record batch. logger.Debug("CommitMessage", slog.Any("CommitLSN", msg.CommitLSN), slog.Any("TransactionEndLSN", msg.TransactionEndLSN)) - batch.UpdateLatestCheckpoint(int64(msg.CommitLSN)) + batch.UpdateLatestCheckpointID(int64(msg.CommitLSN)) p.commitLock = nil case *pglogrepl.RelationMessage: // treat all relation messages as corresponding to parent if partitioned. @@ -715,7 +715,7 @@ func processMessage[Items model.Items]( slog.String("Prefix", msg.Prefix), slog.String("LSN", msg.LSN.String())) if !msg.Transactional { - batch.UpdateLatestCheckpoint(int64(msg.LSN)) + batch.UpdateLatestCheckpointID(int64(msg.LSN)) } return &model.MessageRecord[Items]{ BaseRecord: p.baseRecord(msg.LSN), diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index 1d7d7d589f..009828d288 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -25,7 +25,7 @@ import ( const ( mirrorJobsTableIdentifier = "peerdb_mirror_jobs" createMirrorJobsTableSQL = `CREATE TABLE IF NOT EXISTS %s.%s(mirror_job_name TEXT PRIMARY KEY, - lsn_offset BIGINT NOT NULL,sync_batch_id BIGINT NOT NULL,normalize_batch_id BIGINT NOT NULL)` + lsn_offset BIGINT NOT NULL,lsn_text TEXT NOT NULL,sync_batch_id BIGINT NOT NULL,normalize_batch_id BIGINT NOT NULL)` rawTablePrefix = "_peerdb_raw" createSchemaSQL = "CREATE SCHEMA IF NOT EXISTS %s" createRawTableSQL = `CREATE TABLE IF NOT EXISTS %s.%s(_peerdb_uid uuid NOT NULL, @@ -35,14 +35,14 @@ const ( createRawTableBatchIDIndexSQL = "CREATE INDEX IF NOT EXISTS %s_batchid_idx ON %s.%s(_peerdb_batch_id)" createRawTableDstTableIndexSQL = "CREATE INDEX IF NOT EXISTS %s_dst_table_idx ON %s.%s(_peerdb_destination_table_name)" - getLastOffsetSQL = "SELECT lsn_offset FROM %s.%s WHERE mirror_job_name=$1" - setLastOffsetSQL = "UPDATE %s.%s SET lsn_offset=GREATEST(lsn_offset, $1) WHERE mirror_job_name=$2" + getLastOffsetSQL = "SELECT lsn_offset, lsn_text FROM %s.%s WHERE mirror_job_name=$1" + setLastOffsetSQL = "UPDATE %s.%s SET lsn_offset=GREATEST(lsn_offset, $1), lsn_text = $2 WHERE mirror_job_name=$3" getLastSyncBatchID_SQL = "SELECT sync_batch_id FROM %s.%s WHERE mirror_job_name=$1" getLastNormalizeBatchID_SQL = "SELECT normalize_batch_id FROM %s.%s WHERE mirror_job_name=$1" createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)" checkTableExistsSQL = "SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_tables WHERE schemaname = $1 AND tablename = $2)" - upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s AS j VALUES ($1,$2,$3,$4) - ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), sync_batch_id=EXCLUDED.sync_batch_id` + upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s (mirror_job_name, lsn_offset, lsn_text, sync_batch_id) AS j VALUES ($1,$2,$3,$4) + ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), lsn_text = EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` checkIfJobMetadataExistsSQL = "SELECT COUNT(1)::TEXT::BOOL FROM %s.%s WHERE mirror_job_name=$1" updateMetadataForNormalizeRecordsSQL = "UPDATE %s.%s SET normalize_batch_id=$1 WHERE mirror_job_name=$2" @@ -561,12 +561,12 @@ func (c *PostgresConnector) MajorVersion(ctx 
context.Context) (shared.PGVersion, return c.pgVersion, nil } -func (c *PostgresConnector) updateSyncMetadata(ctx context.Context, flowJobName string, lastCP int64, syncBatchID int64, +func (c *PostgresConnector) updateSyncMetadata(ctx context.Context, flowJobName string, lastCP model.CdcCheckpoint, syncBatchID int64, syncRecordsTx pgx.Tx, ) error { _, err := syncRecordsTx.Exec(ctx, fmt.Sprintf(upsertJobMetadataForSyncSQL, c.metadataSchema, mirrorJobsTableIdentifier), - flowJobName, lastCP, syncBatchID, 0) + flowJobName, lastCP.ID, lastCP.Text, syncBatchID) if err != nil { return fmt.Errorf("failed to upsert flow job status: %w", err) } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 3b7a302773..0b1940bb18 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -302,27 +302,27 @@ func (c *PostgresConnector) SetupMetadataTables(ctx context.Context) error { } // GetLastOffset returns the last synced offset for a job. -func (c *PostgresConnector) GetLastOffset(ctx context.Context, jobName string) (int64, error) { - var result pgtype.Int8 - err := c.conn.QueryRow(ctx, fmt.Sprintf(getLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName).Scan(&result) +func (c *PostgresConnector) GetLastOffset(ctx context.Context, jobName string) (model.CdcCheckpoint, error) { + var result model.CdcCheckpoint + err := c.conn.QueryRow(ctx, fmt.Sprintf(getLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName).Scan(&result.ID, &result.Text) if err != nil { if err == pgx.ErrNoRows { c.logger.Info("No row found, returning nil") - return 0, nil + return result, nil } - return 0, fmt.Errorf("error while reading result row: %w", err) + return result, fmt.Errorf("error while reading result row: %w", err) } - if result.Int64 == 0 { + if result.ID == 0 && result.Text == "" { c.logger.Warn("Assuming zero offset means no sync has happened") } - return result.Int64, nil + return result, nil } // SetLastOffset updates the last synced offset for a job. -func (c *PostgresConnector) SetLastOffset(ctx context.Context, jobName string, lastOffset int64) error { +func (c *PostgresConnector) SetLastOffset(ctx context.Context, jobName string, lastOffset model.CdcCheckpoint) error { _, err := c.conn. 
- Exec(ctx, fmt.Sprintf(setLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), lastOffset, jobName) + Exec(ctx, fmt.Sprintf(setLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), lastOffset.ID, lastOffset.Text, jobName) if err != nil { return fmt.Errorf("error setting last offset for job %s: %w", jobName, err) } @@ -360,7 +360,7 @@ func pullCore[Items model.Items]( defer func() { req.RecordStream.Close() if c.replState != nil { - c.replState.Offset = req.RecordStream.GetLastCheckpoint() + c.replState.Offset = req.RecordStream.GetLastCheckpoint().ID } }() @@ -447,9 +447,9 @@ func pullCore[Items model.Items]( return nil } -func (c *PostgresConnector) UpdateReplStateLastOffset(lastOffset int64) { +func (c *PostgresConnector) UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) { if c.replState != nil { - c.replState.LastOffset.Store(lastOffset) + c.replState.LastOffset.Store(lastOffset.ID) } } @@ -583,27 +583,24 @@ func syncRecordsCore[Items model.Items]( // updating metadata with new offset and syncBatchID lastCP := req.Records.GetLastCheckpoint() - err = c.updateSyncMetadata(ctx, req.FlowJobName, lastCP, req.SyncBatchID, syncRecordsTx) - if err != nil { + if err := c.updateSyncMetadata(ctx, req.FlowJobName, lastCP, req.SyncBatchID, syncRecordsTx); err != nil { return nil, err } // transaction commits - err = syncRecordsTx.Commit(ctx) - if err != nil { + if err := syncRecordsTx.Commit(ctx); err != nil { return nil, err } - err = c.ReplayTableSchemaDeltas(ctx, req.Env, req.FlowJobName, req.Records.SchemaDeltas) - if err != nil { + if err := c.ReplayTableSchemaDeltas(ctx, req.Env, req.FlowJobName, req.Records.SchemaDeltas); err != nil { return nil, fmt.Errorf("failed to sync schema changes: %w", err) } return &model.SyncResponse{ - LastSyncedCheckpointID: lastCP, - NumRecordsSynced: numRecords, - CurrentSyncBatchID: req.SyncBatchID, - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + LastSyncedCheckpoint: lastCP, + NumRecordsSynced: numRecords, + CurrentSyncBatchID: req.SyncBatchID, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/pubsub/pubsub.go b/flow/connectors/pubsub/pubsub.go index 25b6e006b5..30b6088558 100644 --- a/flow/connectors/pubsub/pubsub.go +++ b/flow/connectors/pubsub/pubsub.go @@ -295,7 +295,7 @@ func (c *PubSubConnector) SyncRecords(ctx context.Context, req *model.SyncRecord case <-ticker.C: lastSeen := lastSeenLSN.Load() if lastSeen > req.ConsumedOffset.Load() { - if err := c.SetLastOffset(ctx, req.FlowJobName, lastSeen); err != nil { + if err := c.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: lastSeen}); err != nil { c.logger.Warn("[pubsub] SetLastOffset error", slog.Any("error", err)) } else { shared.AtomicInt64Max(req.ConsumedOffset, lastSeen) @@ -378,10 +378,10 @@ Loop: } return &model.SyncResponse{ - CurrentSyncBatchID: req.SyncBatchID, - LastSyncedCheckpointID: lastCheckpoint, - NumRecordsSynced: numRecords.Load(), - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + CurrentSyncBatchID: req.SyncBatchID, + LastSyncedCheckpoint: lastCheckpoint, + NumRecordsSynced: numRecords.Load(), + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/s3/s3.go b/flow/connectors/s3/s3.go index e990820e88..d4abd6cc07 100644 --- a/flow/connectors/s3/s3.go +++ b/flow/connectors/s3/s3.go @@ -111,10 +111,10 @@ func 
(c *S3Connector) SyncRecords(ctx context.Context, req *model.SyncRecordsReq } return &model.SyncResponse{ - LastSyncedCheckpointID: lastCheckpoint, - NumRecordsSynced: int64(numRecords), - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + LastSyncedCheckpoint: lastCheckpoint, + NumRecordsSynced: int64(numRecords), + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/snowflake/snowflake.go b/flow/connectors/snowflake/snowflake.go index 04840dd165..dc84ae0bb5 100644 --- a/flow/connectors/snowflake/snowflake.go +++ b/flow/connectors/snowflake/snowflake.go @@ -423,7 +423,7 @@ func (c *SnowflakeConnector) SyncRecords(ctx context.Context, req *model.SyncRec return nil, err } - if err := c.FinishBatch(ctx, req.FlowJobName, req.SyncBatchID, res.LastSyncedCheckpointID); err != nil { + if err := c.FinishBatch(ctx, req.FlowJobName, req.SyncBatchID, res.LastSyncedCheckpoint); err != nil { return nil, err } @@ -466,11 +466,11 @@ func (c *SnowflakeConnector) syncRecordsViaAvro( } return &model.SyncResponse{ - LastSyncedCheckpointID: req.Records.GetLastCheckpoint(), - NumRecordsSynced: int64(numRecords), - CurrentSyncBatchID: syncBatchID, - TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: req.Records.SchemaDeltas, + LastSyncedCheckpoint: req.Records.GetLastCheckpoint(), + NumRecordsSynced: int64(numRecords), + CurrentSyncBatchID: syncBatchID, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, }, nil } diff --git a/flow/connectors/utils/monitoring/monitoring.go b/flow/connectors/utils/monitoring/monitoring.go index beddcfa56c..c33bdc6f0f 100644 --- a/flow/connectors/utils/monitoring/monitoring.go +++ b/flow/connectors/utils/monitoring/monitoring.go @@ -78,11 +78,11 @@ func UpdateNumRowsAndEndLSNForCDCBatch( flowJobName string, batchID int64, numRows uint32, - batchEndLSN int64, + batchEndCheckpoint model.CdcCheckpoint, ) error { _, err := pool.Exec(ctx, - "UPDATE peerdb_stats.cdc_batches SET rows_in_batch=$1,batch_end_lsn=$2 WHERE flow_name=$3 AND batch_id=$4", - numRows, uint64(batchEndLSN), flowJobName, batchID) + "UPDATE peerdb_stats.cdc_batches SET rows_in_batch=$1,batch_end_lsn=$2,batch_end_lsn_text=$3 WHERE flow_name=$4 AND batch_id=$5", + numRows, uint64(batchEndCheckpoint.ID), batchEndCheckpoint.Text, flowJobName, batchID) if err != nil { return fmt.Errorf("error while updating batch in cdc_batch: %w", err) } diff --git a/flow/model/cdc_stream.go b/flow/model/cdc_stream.go index 6184fcc5b7..94baec99da 100644 --- a/flow/model/cdc_stream.go +++ b/flow/model/cdc_stream.go @@ -3,7 +3,6 @@ package model import ( "context" "log/slog" - "sync/atomic" "time" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -15,40 +14,52 @@ type CDCStream[T Items] struct { emptySignal chan bool records chan Record[T] // Schema changes from slot - SchemaDeltas []*protos.TableSchemaDelta - lastCheckpointSet bool - needsNormalize atomic.Bool + SchemaDeltas []*protos.TableSchemaDelta + // lastCheckpointText is used for mysql GTID + lastCheckpointText string // lastCheckpointID is the last ID of the commit that corresponds to this batch. 
- lastCheckpointID atomic.Int64 + lastCheckpointID int64 + lastCheckpointSet bool + needsNormalize bool +} + +type CdcCheckpoint struct { + Text string + ID int64 } func NewCDCStream[T Items](channelBuffer int) *CDCStream[T] { return &CDCStream[T]{ - records: make(chan Record[T], channelBuffer), - SchemaDeltas: make([]*protos.TableSchemaDelta, 0), - emptySignal: make(chan bool, 1), - lastCheckpointSet: false, - lastCheckpointID: atomic.Int64{}, - needsNormalize: atomic.Bool{}, + records: make(chan Record[T], channelBuffer), + SchemaDeltas: make([]*protos.TableSchemaDelta, 0), + emptySignal: make(chan bool, 1), + lastCheckpointID: 0, + lastCheckpointText: "", + needsNormalize: false, + lastCheckpointSet: false, } } -func (r *CDCStream[T]) UpdateLatestCheckpoint(val int64) { - shared.AtomicInt64Max(&r.lastCheckpointID, val) +func (r *CDCStream[T]) UpdateLatestCheckpointID(val int64) { + r.lastCheckpointID = max(r.lastCheckpointID, val) +} + +func (r *CDCStream[T]) UpdateLatestCheckpointText(val string) { + r.lastCheckpointText = val } -func (r *CDCStream[T]) GetLastCheckpoint() int64 { +func (r *CDCStream[T]) GetLastCheckpoint() CdcCheckpoint { if !r.lastCheckpointSet { panic("last checkpoint not set, stream is still active") } - return r.lastCheckpointID.Load() + return CdcCheckpoint{ID: r.lastCheckpointID, Text: r.lastCheckpointText} } func (r *CDCStream[T]) AddRecord(ctx context.Context, record Record[T]) error { - if !r.needsNormalize.Load() { + if !r.needsNormalize { switch record.(type) { case *InsertRecord[T], *UpdateRecord[T], *DeleteRecord[T]: - r.needsNormalize.Store(true) + r.needsNormalize = true } } @@ -102,5 +113,5 @@ func (r *CDCStream[T]) AddSchemaDelta( } func (r *CDCStream[T]) NeedsNormalize() bool { - return r.needsNormalize.Load() + return r.needsNormalize } diff --git a/flow/model/model.go b/flow/model/model.go index f5fdb54fab..f42b069ea8 100644 --- a/flow/model/model.go +++ b/flow/model/model.go @@ -75,7 +75,7 @@ type PullRecordsRequest[T Items] struct { // override replication slot name OverrideReplicationSlotName string // LastOffset is the latest LSN that was synced. - LastOffset int64 + LastOffset CdcCheckpoint // MaxBatchSize is the max number of records to fetch. MaxBatchSize uint32 // IdleTimeout is the timeout to wait for new records. @@ -162,8 +162,8 @@ type SyncResponse struct { TableNameRowsMapping map[string]*RecordTypeCounts // to be carried to parent workflow TableSchemaDeltas []*protos.TableSchemaDelta - // LastSyncedCheckpointID is the last ID that was synced. - LastSyncedCheckpointID int64 + // LastSyncedCheckpoint is the last state (eg LSN, GTID) that was synced. + LastSyncedCheckpoint CdcCheckpoint // NumRecordsSynced is the number of records that were synced. 
NumRecordsSynced int64 CurrentSyncBatchID int64 diff --git a/flow/pua/stream_adapter.go b/flow/pua/stream_adapter.go index 7f4c5ffcfc..6913ea40fb 100644 --- a/flow/pua/stream_adapter.go +++ b/flow/pua/stream_adapter.go @@ -76,7 +76,9 @@ func AttachToCdcStream( } } outstream.SchemaDeltas = stream.SchemaDeltas - outstream.UpdateLatestCheckpoint(stream.GetLastCheckpoint()) + lastCP := stream.GetLastCheckpoint() + outstream.UpdateLatestCheckpointID(lastCP.ID) + outstream.UpdateLatestCheckpointText(lastCP.Text) outstream.Close() }() return outstream diff --git a/nexus/catalog/migrations/V42__mysql_metadata.sql b/nexus/catalog/migrations/V42__mysql_metadata.sql index fe128403ec..e5558667e1 100644 --- a/nexus/catalog/migrations/V42__mysql_metadata.sql +++ b/nexus/catalog/migrations/V42__mysql_metadata.sql @@ -1,10 +1,2 @@ -CREATE TABLE IF NOT EXISTS metadata_mysql_sync_state ( - job_name TEXT PRIMARY KEY NOT NULL, - pos_file text, - pos_offset int, - gtid text, - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - sync_batch_id BIGINT NOT NULL, - normalize_batch_id BIGINT -); +ALTER TABLE metadata_last_sync_state ADD COLUMN IF NOT EXISTS last_text text NOT NULL DEFAULT ''; From 2cd88bb8ab8bb979da6df00f7dfb8a0b61dc6da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 16 Dec 2024 16:46:16 +0000 Subject: [PATCH 04/80] filling out logic to take in row --- flow/connectors/mysql/cdc.go | 113 +++++++++++++++++++++++++++------ flow/connectors/mysql/mysql.go | 5 +- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index b2c17d7f17..ee6a2c3eda 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -2,10 +2,14 @@ package connmysql import ( "context" + "errors" + "fmt" + "time" "github.com/go-mysql-org/go-mysql/mysql" "github.com/go-mysql-org/go-mysql/replication" "github.com/jackc/pgx/v5/pgxpool" + "github.com/shopspring/decimal" "github.com/PeerDB-io/peerdb/flow/alerting" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -82,8 +86,60 @@ func (c *MySqlConnector) RemoveTablesFromPublication(ctx context.Context, req *p return nil } -func qvalueFromMysql(typ byte, val any) qvalue.QValue { - // TODO +func qvalueFromMysql(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { + // TODO signedness, in ev.Table, need to extend QValue system + // See go-mysql row_event.go for mapping + switch val := val.(type) { + case nil: + return qvalue.QValueNull(qkind) + case int8: // TODO qvalue.Int8 + return qvalue.QValueInt16{Val: int16(val)} + case int16: + return qvalue.QValueInt16{Val: val} + case int32: + return qvalue.QValueInt32{Val: val} + case int64: + return qvalue.QValueInt64{Val: val} + case float32: + return qvalue.QValueFloat32{Val: val} + case float64: + return qvalue.QValueFloat64{Val: val} + case decimal.Decimal: + return qvalue.QValueNumeric{Val: val} + case int: + // YEAR: https://dev.mysql.com/doc/refman/8.4/en/year.html + return qvalue.QValueInt16{Val: int16(val)} + case time.Time: + return qvalue.QValueTimestamp{Val: val} + case *replication.JsonDiff: + // TODO support somehow?? 
+ return qvalue.QValueNull(qvalue.QValueKindJSON) + case []byte: + switch mytype { + case mysql.MYSQL_TYPE_BLOB: + return qvalue.QValueBytes{Val: val} + case mysql.MYSQL_TYPE_JSON: + return qvalue.QValueJSON{Val: string(val)} + case mysql.MYSQL_TYPE_GEOMETRY: + // TODO figure out mysql geo encoding + return qvalue.QValueGeometry{Val: string(val)} + } + case string: + switch mytype { + case mysql.MYSQL_TYPE_TIME: + // TODO parse + case mysql.MYSQL_TYPE_TIME2: + // TODO parse + case mysql.MYSQL_TYPE_DATE: + // TODO parse + case mysql.MYSQL_TYPE_VARCHAR, + mysql.MYSQL_TYPE_VAR_STRING, + mysql.MYSQL_TYPE_STRING: + return qvalue.QValueString{Val: val} + } + default: + panic(fmt.Sprintf("unexpected type %T for mysql type %d", val, mytype)) + } return nil } @@ -112,34 +168,55 @@ func (c *MySqlConnector) PullRecords( } switch ev := event.Event.(type) { case *replication.RowsEvent: + sourceTableName := string(ev.Table.Table) // TODO need ev.Table.Schema? + destinationTableName := req.TableNameMapping[sourceTableName].Name + schema := req.TableNameSchemaMapping[destinationTableName] for _, row := range ev.Rows { var record model.Record[model.RecordItems] - //TODO need tableNameMapping[source] -> destination //TODO need mapping of column index to column name var items model.RecordItems switch event.Header.EventType { - case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: + case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: + return errors.New("mysql v0 replication protocol not supported") + case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: for idx, val := range row { - // TODO - items.AddColumn("ColumnName", qvalueFromMysql(ev.Table.ColumnType[idx], val)) + fd := schema.Columns[idx] + items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } record = &model.InsertRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - Items: items, - SourceTableName: string(ev.Table.Table), + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, + } + case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: + var oldItems model.RecordItems + for idx, val := range row { + fd := schema.Columns[idx>>1] + qv := qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + if (idx & 1) == 0 { // TODO test that it isn't other way around + oldItems.AddColumn(fd.Name, qv) + } else { + items.AddColumn(fd.Name, qv) + } } - case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: - // TODO no OldItems / NewItems. How does primary key update work? 
record = &model.UpdateRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - NewItems: items, - SourceTableName: string(ev.Table.Table), + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + OldItems: oldItems, + NewItems: items, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, + } + case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: + for idx, val := range row { + fd := schema.Columns[idx] + items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } - case replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: record = &model.DeleteRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - Items: items, - SourceTableName: string(ev.Table.Table), + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, } default: continue diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 3cb21b1bcb..c1d297ac5e 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -32,6 +32,7 @@ func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlC User: config.User, Password: config.Password, UseDecimal: true, + ParseTime: true, }) return &MySqlConnector{ config: config, @@ -121,11 +122,11 @@ func (c *MySqlConnector) GetMasterGTIDSet(ctx context.Context) (mysql.GTIDSet, e } rr, err := c.Execute(ctx, query) if err != nil { - return nil, fmt.Errorf("failed to SELECT @@GLOBAL.GTID_EXECUTED", err) + return nil, fmt.Errorf("failed to SELECT @@GLOBAL.GTID_EXECUTED: %w", err) } gx, err := rr.GetString(0, 0) if err != nil { - return nil, fmt.Errorf("failed to GetString for GTID_EXECUTED", err) + return nil, fmt.Errorf("failed to GetString for GTID_EXECUTED: %w", err) } gset, err := mysql.ParseGTIDSet(c.config.Flavor, gx) if err != nil { From 7edd0fee303505fb86ca0385e1fc086de23699f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 25 Dec 2024 17:46:00 +0000 Subject: [PATCH 05/80] fix lints --- flow/activities/flowable_core.go | 10 +++++----- flow/connectors/mysql/cdc.go | 17 +++++++++-------- flow/connectors/mysql/mysql.go | 2 +- flow/connectors/postgres/cdc.go | 4 ++-- flow/connectors/postgres/client.go | 3 ++- flow/connectors/postgres/postgres.go | 7 ++++--- flow/model/cdc_stream.go | 4 ++-- flow/workflows/setup_flow.go | 10 ++++++---- 8 files changed, 31 insertions(+), 26 deletions(-) diff --git a/flow/activities/flowable_core.go b/flow/activities/flowable_core.go index 9eaf316a61..f759d4de1a 100644 --- a/flow/activities/flowable_core.go +++ b/flow/activities/flowable_core.go @@ -133,10 +133,10 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon batchSize = 250_000 } - lastOffset, err := func() (int64, error) { + lastOffset, err := func() (model.CdcCheckpoint, error) { dstConn, err := connectors.GetByNameAs[TSync](ctx, config.Env, a.CatalogPool, config.DestinationName) if err != nil { - return 0, fmt.Errorf("failed to get destination connector: %w", err) + return model.CdcCheckpoint{}, fmt.Errorf("failed to get destination connector: %w", err) } defer connectors.CloseConnector(ctx, dstConn) @@ -147,9 +147,9 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync 
connectors.CDCSyncCon return nil, err } - logger.Info("pulling records...", slog.Int64("LastOffset", lastOffset)) + logger.Info("pulling records...", slog.Any("LastOffset", lastOffset)) consumedOffset := atomic.Int64{} - consumedOffset.Store(lastOffset) + consumedOffset.Store(lastOffset.ID) channelBufferSize, err := peerdbenv.PeerDBCDCChannelBufferSize(ctx, config.Env) if err != nil { @@ -296,7 +296,7 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon return nil, err } - if err := monitoring.UpdateLatestLSNAtTargetForCDCFlow(ctx, a.CatalogPool, flowName, lastCheckpoint); err != nil { + if err := monitoring.UpdateLatestLSNAtTargetForCDCFlow(ctx, a.CatalogPool, flowName, lastCheckpoint.ID); err != nil { a.Alerter.LogFlowError(ctx, flowName, err) return nil, err } diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index ee6a2c3eda..3a37071bca 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -27,8 +27,11 @@ func (c *MySqlConnector) GetTableSchema( panic("TODO") } -func (c *MySqlConnector) EnsurePullability(ctx context.Context, req *protos.EnsurePullabilityBatchInput) ( - *protos.EnsurePullabilityBatchOutput, error) +func (c *MySqlConnector) EnsurePullability( + ctx context.Context, req *protos.EnsurePullabilityBatchInput, +) (*protos.EnsurePullabilityBatchOutput, error) { + return nil, nil +} func (c *MySqlConnector) ExportTxSnapshot(context.Context) (*protos.ExportTxSnapshotOutput, any, error) { // https://dev.mysql.com/doc/refman/8.4/en/replication-howto-masterstatus.html @@ -44,6 +47,7 @@ func (c *MySqlConnector) SetupReplConn(context.Context) error { return nil } +//nolint:unused func (c *MySqlConnector) startCdcStreamingFilePos(lastOffsetName string, lastOffsetPos uint32) (*replication.BinlogStreamer, error) { return c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) } @@ -166,14 +170,13 @@ func (c *MySqlConnector) PullRecords( if err != nil { return err } - switch ev := event.Event.(type) { - case *replication.RowsEvent: + if ev, ok := event.Event.(*replication.RowsEvent); ok { sourceTableName := string(ev.Table.Table) // TODO need ev.Table.Schema? 
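
A note on the offset type threaded through these hunks: model.CdcCheckpoint deliberately carries both an integer ID and a Text field, so one checkpoint type serves an LSN-based source and a GTID-based one. A minimal illustrative sketch (the helper and the literal values below are not part of the patch; only the CdcCheckpoint field names come from it):

    package main

    import (
        "fmt"

        "github.com/PeerDB-io/peerdb/flow/model"
    )

    // describeCheckpoint shows the convention used in the series: Postgres keeps
    // the LSN in ID and leaves Text empty, while MySQL keeps a GTID set (or
    // binlog coordinates) in Text. Integer-only consumers, like the
    // consumedOffset atomic above, read just the ID part.
    func describeCheckpoint(cp model.CdcCheckpoint) string {
        if cp.Text != "" {
            return "gtid " + cp.Text
        }
        return fmt.Sprintf("lsn %d", cp.ID)
    }

    func main() {
        fmt.Println(describeCheckpoint(model.CdcCheckpoint{ID: 123456}))         // Postgres-style offset
        fmt.Println(describeCheckpoint(model.CdcCheckpoint{Text: "uuid:1-100"})) // MySQL-style offset, value illustrative
    }
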
destinationTableName := req.TableNameMapping[sourceTableName].Name schema := req.TableNameSchemaMapping[destinationTableName] for _, row := range ev.Rows { var record model.Record[model.RecordItems] - //TODO need mapping of column index to column name + // TODO need mapping of column index to column name var items model.RecordItems switch event.Header.EventType { case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: @@ -221,12 +224,10 @@ func (c *MySqlConnector) PullRecords( default: continue } - err := req.RecordStream.AddRecord(ctx, record) - if err != nil { + if err := req.RecordStream.AddRecord(ctx, record); err != nil { return err } } - break } break // TODO when batch ready } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index c1d297ac5e..2156394abf 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -113,7 +113,7 @@ func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, erro } func (c *MySqlConnector) GetMasterGTIDSet(ctx context.Context) (mysql.GTIDSet, error) { - query := "" + var query string switch c.config.Flavor { case mysql.MariaDBFlavor: query = "SELECT @@GLOBAL.gtid_current_pos" diff --git a/flow/connectors/postgres/cdc.go b/flow/connectors/postgres/cdc.go index 8a5cdd9160..b28fa6cf7a 100644 --- a/flow/connectors/postgres/cdc.go +++ b/flow/connectors/postgres/cdc.go @@ -326,8 +326,8 @@ func PullCdcRecords[Items model.Items]( // clientXLogPos is the last checkpoint id, we need to ack that we have processed // until clientXLogPos each time we send a standby status update. var clientXLogPos pglogrepl.LSN - if req.LastOffset > 0 { - clientXLogPos = pglogrepl.LSN(req.LastOffset) + if req.LastOffset.ID > 0 { + clientXLogPos = pglogrepl.LSN(req.LastOffset.ID) if err := sendStandbyAfterReplLock("initial-flush"); err != nil { return err } diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index 009828d288..a65fc588c1 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -42,7 +42,8 @@ const ( createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)" checkTableExistsSQL = "SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_tables WHERE schemaname = $1 AND tablename = $2)" upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s (mirror_job_name, lsn_offset, lsn_text, sync_batch_id) AS j VALUES ($1,$2,$3,$4) - ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), lsn_text = EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` + ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), + lsn_text = EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` checkIfJobMetadataExistsSQL = "SELECT COUNT(1)::TEXT::BOOL FROM %s.%s WHERE mirror_job_name=$1" updateMetadataForNormalizeRecordsSQL = "UPDATE %s.%s SET normalize_batch_id=$1 WHERE mirror_job_name=$2" diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 0b1940bb18..8c7505ed92 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -304,8 +304,9 @@ func (c *PostgresConnector) SetupMetadataTables(ctx context.Context) error { // GetLastOffset returns the last synced offset for a job. 
func (c *PostgresConnector) GetLastOffset(ctx context.Context, jobName string) (model.CdcCheckpoint, error) { var result model.CdcCheckpoint - err := c.conn.QueryRow(ctx, fmt.Sprintf(getLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName).Scan(&result.ID, &result.Text) - if err != nil { + if err := c.conn.QueryRow( + ctx, fmt.Sprintf(getLastOffsetSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName, + ).Scan(&result.ID, &result.Text); err != nil { if err == pgx.ErrNoRows { c.logger.Info("No row found, returning nil") return result, nil @@ -405,7 +406,7 @@ func pullCore[Items model.Items]( return fmt.Errorf("error getting child to parent relid map: %w", err) } - if err := c.MaybeStartReplication(ctx, slotName, publicationName, req.LastOffset, pgVersion); err != nil { + if err := c.MaybeStartReplication(ctx, slotName, publicationName, req.LastOffset.ID, pgVersion); err != nil { // in case of Aurora error ERROR: replication slots cannot be used on RO (Read Only) node (SQLSTATE 55000) if shared.IsSQLStateError(err, pgerrcode.ObjectNotInPrerequisiteState) && strings.Contains(err.Error(), "replication slots cannot be used on RO (Read Only) node") { diff --git a/flow/model/cdc_stream.go b/flow/model/cdc_stream.go index 94baec99da..ec1aef89bb 100644 --- a/flow/model/cdc_stream.go +++ b/flow/model/cdc_stream.go @@ -13,10 +13,10 @@ type CDCStream[T Items] struct { // empty signal to indicate if the records are going to be empty or not. emptySignal chan bool records chan Record[T] - // Schema changes from slot - SchemaDeltas []*protos.TableSchemaDelta // lastCheckpointText is used for mysql GTID lastCheckpointText string + // Schema changes from slot + SchemaDeltas []*protos.TableSchemaDelta // lastCheckpointID is the last ID of the commit that corresponds to this batch. 
lastCheckpointID int64 lastCheckpointSet bool diff --git a/flow/workflows/setup_flow.go b/flow/workflows/setup_flow.go index 5c666ac537..26ce9bafa9 100644 --- a/flow/workflows/setup_flow.go +++ b/flow/workflows/setup_flow.go @@ -119,15 +119,12 @@ func (s *SetupFlowExecution) ensurePullability( InitialInterval: 1 * time.Minute, }, }) - srcTableIdNameMapping := make(map[uint32]string) - - srcTblIdentifiers := slices.Sorted(maps.Keys(s.tableNameMapping)) // create EnsurePullabilityInput for the srcTableName ensurePullabilityInput := &protos.EnsurePullabilityBatchInput{ PeerName: config.SourceName, FlowJobName: s.cdcFlowName, - SourceTableIdentifiers: srcTblIdentifiers, + SourceTableIdentifiers: slices.Sorted(maps.Keys(s.tableNameMapping)), CheckConstraints: checkConstraints, } @@ -138,8 +135,13 @@ func (s *SetupFlowExecution) ensurePullability( return nil, fmt.Errorf("failed to ensure pullability for tables: %w", err) } + if ensurePullabilityOutput == nil { + return nil, nil + } + sortedTableNames := slices.Sorted(maps.Keys(ensurePullabilityOutput.TableIdentifierMapping)) + srcTableIdNameMapping := make(map[uint32]string, len(sortedTableNames)) for _, tableName := range sortedTableNames { tableIdentifier := ensurePullabilityOutput.TableIdentifierMapping[tableName] srcTableIdNameMapping[tableIdentifier.RelId] = tableName From 2f1d4477a1e8bc25bbf5b0a872659c46c0c32c64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 25 Dec 2024 18:27:42 +0000 Subject: [PATCH 06/80] lift logic out of connector into activity, will make easier to implement for new connectors --- flow/activities/flowable.go | 23 +++++++++++++++++++++-- flow/connectors/postgres/qrep.go | 27 +++------------------------ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index 48c10cc698..7c0058a5b5 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -6,6 +6,7 @@ import ( "fmt" "log/slog" "sync/atomic" + "time" "github.com/jackc/pgerrcode" "github.com/jackc/pgx/v5" @@ -806,12 +807,30 @@ func (a *FlowableActivity) QRepHasNewRows(ctx context.Context, logger.Info(fmt.Sprintf("current last partition value is %v", last)) - result, err := srcConn.CheckForUpdatedMaxValue(ctx, config, last) + maxValue, err := srcConn.GetMaxValue(ctx, config, last) if err != nil { a.Alerter.LogFlowError(ctx, config.FlowJobName, err) return false, fmt.Errorf("failed to check for new rows: %w", err) } - return result, nil + + if maxValue == nil || last == nil || last.Range == nil { + return maxValue != nil, nil + } + + switch x := last.Range.Range.(type) { + case *protos.PartitionRange_IntRange: + if maxValue.(int64) > x.IntRange.End { + return true, nil + } + case *protos.PartitionRange_TimestampRange: + if maxValue.(time.Time).After(x.TimestampRange.End.AsTime()) { + return true, nil + } + default: + return false, fmt.Errorf("unknown range type: %v", x) + } + + return false, nil } func (a *FlowableActivity) RenameTables(ctx context.Context, config *protos.RenameTablesInput) (*protos.RenameTablesOutput, error) { diff --git a/flow/connectors/postgres/qrep.go b/flow/connectors/postgres/qrep.go index ec45e98839..22fb456ea4 100644 --- a/flow/connectors/postgres/qrep.go +++ b/flow/connectors/postgres/qrep.go @@ -264,11 +264,11 @@ func (c *PostgresConnector) getMinMaxValues( return minValue, maxValue, nil } -func (c *PostgresConnector) CheckForUpdatedMaxValue( +func (c *PostgresConnector) GetMaxValue( ctx context.Context, config *protos.QRepConfig, 
last *protos.QRepPartition, -) (bool, error) { +) (any, error) { checkTx, err := c.conn.Begin(ctx) if err != nil { return false, fmt.Errorf("unable to begin transaction for getting max value: %w", err) @@ -276,28 +276,7 @@ func (c *PostgresConnector) CheckForUpdatedMaxValue( defer shared.RollbackTx(checkTx, c.logger) _, maxValue, err := c.getMinMaxValues(ctx, checkTx, config, last) - if err != nil { - return false, fmt.Errorf("error while getting min and max values: %w", err) - } - - if maxValue == nil || last == nil || last.Range == nil { - return maxValue != nil, nil - } - - switch x := last.Range.Range.(type) { - case *protos.PartitionRange_IntRange: - if maxValue.(int64) > x.IntRange.End { - return true, nil - } - case *protos.PartitionRange_TimestampRange: - if maxValue.(time.Time).After(x.TimestampRange.End.AsTime()) { - return true, nil - } - default: - return false, fmt.Errorf("unknown range type: %v", x) - } - - return false, nil + return maxValue, err } func (c *PostgresConnector) PullQRepRecords( From fb82b3e090b71bad94bb0d14a97dcef83973610b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 25 Dec 2024 20:51:31 +0000 Subject: [PATCH 07/80] mysql: GetVersionConnector --- flow/connectors/core.go | 3 +-- flow/connectors/mysql/mysql.go | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 294eeeb62a..9450667ac1 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -518,6 +518,5 @@ var ( _ GetVersionConnector = &connclickhouse.ClickHouseConnector{} _ GetVersionConnector = &connpostgres.PostgresConnector{} - - _ Connector = &connmysql.MySqlConnector{} + _ GetVersionConnector = &connmysql.MySqlConnector{} ) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 2156394abf..5975d4c1d3 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -6,14 +6,17 @@ import ( "context" "crypto/tls" "fmt" + "log/slog" "time" "github.com/go-mysql-org/go-mysql/client" "github.com/go-mysql-org/go-mysql/mysql" "github.com/go-mysql-org/go-mysql/replication" + "go.temporal.io/sdk/log" metadataStore "github.com/PeerDB-io/peerdb/flow/connectors/external_metadata" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/shared" ) type MySqlConnector struct { @@ -21,6 +24,7 @@ type MySqlConnector struct { config *protos.MySqlConfig conn *client.Conn syncer *replication.BinlogSyncer + logger log.Logger } func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { @@ -37,6 +41,7 @@ func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlC return &MySqlConnector{ config: config, syncer: syncer, + logger: shared.LoggerFromCtx(ctx), }, nil } @@ -116,21 +121,32 @@ func (c *MySqlConnector) GetMasterGTIDSet(ctx context.Context) (mysql.GTIDSet, e var query string switch c.config.Flavor { case mysql.MariaDBFlavor: - query = "SELECT @@GLOBAL.gtid_current_pos" + query = "select @@global.gtid_current_pos" default: - query = "SELECT @@GLOBAL.GTID_EXECUTED" + query = "select @@global.gtid_executed" } rr, err := c.Execute(ctx, query) if err != nil { - return nil, fmt.Errorf("failed to SELECT @@GLOBAL.GTID_EXECUTED: %w", err) + return nil, fmt.Errorf("failed to select @@global.gtid_executed: %w", err) } gx, err := rr.GetString(0, 0) if err != nil { - return nil, fmt.Errorf("failed to GetString for GTID_EXECUTED: %w", err) + return nil, 
fmt.Errorf("failed to GetString for gtid_executed: %w", err) } gset, err := mysql.ParseGTIDSet(c.config.Flavor, gx) if err != nil { - return nil, fmt.Errorf("failed to parse GTID from GTID_EXECUTED: %w", err) + return nil, fmt.Errorf("failed to parse GTID from gtid_executed: %w", err) } return gset, nil } + +func (c *MySqlConnector) GetVersion(ctx context.Context) (string, error) { + rr, err := c.Execute(ctx, "select @@version") + if err != nil { + return "", err + } + version, _ := rr.GetString(0, 0) + c.logger.Info("[mysql] version", slog.String("version", version)) + return version, nil + +} From c7ff51dbc15e3c4e114616de8d4cfca2eeb3376c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 25 Dec 2024 21:23:37 +0000 Subject: [PATCH 08/80] mysql fetched bytes counter --- flow/connectors/mysql/cdc.go | 29 +++- flow/connectors/mysql/mysql.go | 3 +- flow/connectors/postgres/cdc.go | 274 ++++++++++++++++---------------- 3 files changed, 161 insertions(+), 145 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 3a37071bca..be941d88a0 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -10,6 +10,8 @@ import ( "github.com/go-mysql-org/go-mysql/replication" "github.com/jackc/pgx/v5/pgxpool" "github.com/shopspring/decimal" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "github.com/PeerDB-io/peerdb/flow/alerting" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -165,11 +167,31 @@ func (c *MySqlConnector) PullRecords( if err != nil { return err } + + var fetchedBytesCounter metric.Int64Counter + if otelManager != nil { + var err error + fetchedBytesCounter, err = otelManager.GetOrInitInt64Counter(otel_metrics.BuildMetricName(otel_metrics.FetchedBytesCounterName), + metric.WithUnit("By"), metric.WithDescription("Bytes received of CopyData over replication slot")) + if err != nil { + return fmt.Errorf("could not get FetchedBytesCounter: %w", err) + } + } + + var recordCount uint32 for { + // TODO put req.IdleTimeout timer on this event, err := mystream.GetEvent(ctx) if err != nil { return err } + + if fetchedBytesCounter != nil { + fetchedBytesCounter.Add(ctx, int64(len(event.RawData)), metric.WithAttributeSet(attribute.NewSet( + attribute.String(otel_metrics.FlowNameKey, req.FlowJobName), + ))) + } + if ev, ok := event.Event.(*replication.RowsEvent); ok { sourceTableName := string(ev.Table.Table) // TODO need ev.Table.Schema? 
destinationTableName := req.TableNameMapping[sourceTableName].Name @@ -224,12 +246,15 @@ func (c *MySqlConnector) PullRecords( default: continue } + recordCount += 1 if err := req.RecordStream.AddRecord(ctx, record); err != nil { return err } } } - break // TODO when batch ready + + if recordCount >= 0 && recordCount >= req.MaxBatchSize { + return nil + } } - return nil } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 5975d4c1d3..a854814535 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -29,7 +29,7 @@ type MySqlConnector struct { func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { syncer := replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ - ServerID: 1729, // TODO put in config + ServerID: 1729, // TODO put in config (or generate randomly, which is what go-mysql-org does) Flavor: config.Flavor, Host: config.Host, Port: uint16(config.Port), @@ -110,7 +110,6 @@ func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, erro return mysql.Position{}, fmt.Errorf("failed to SHOW BINARY LOG STATUS: %w", err) } - // TODO: check error? name, _ := rr.GetString(0, 0) pos, _ := rr.GetInt(0, 1) diff --git a/flow/connectors/postgres/cdc.go b/flow/connectors/postgres/cdc.go index b28fa6cf7a..70d0a0d82c 100644 --- a/flow/connectors/postgres/cdc.go +++ b/flow/connectors/postgres/cdc.go @@ -353,8 +353,7 @@ func PullCdcRecords[Items model.Items]( }) defer shutdown() - standbyMessageTimeout := req.IdleTimeout - nextStandbyMessageDeadline := time.Now().Add(standbyMessageTimeout) + nextStandbyMessageDeadline := time.Now().Add(req.IdleTimeout) addRecordWithKey := func(key model.TableWithPkey, rec model.Record[Items]) error { if err := cdcRecordsStorage.Set(logger, key, rec); err != nil { @@ -366,7 +365,7 @@ func PullCdcRecords[Items model.Items]( if cdcRecordsStorage.Len() == 1 { records.SignalAsNotEmpty() - nextStandbyMessageDeadline = time.Now().Add(standbyMessageTimeout) + nextStandbyMessageDeadline = time.Now().Add(req.IdleTimeout) logger.Info(fmt.Sprintf("pushing the standby deadline to %s", nextStandbyMessageDeadline)) } return nil @@ -449,7 +448,7 @@ func PullCdcRecords[Items model.Items]( p.flowJobName), ) } - nextStandbyMessageDeadline = time.Now().Add(standbyMessageTimeout) + nextStandbyMessageDeadline = time.Now().Add(req.IdleTimeout) } var receiveCtx context.Context @@ -480,166 +479,159 @@ func PullCdcRecords[Items model.Items]( } } - if errMsg, ok := rawMsg.(*pgproto3.ErrorResponse); ok { - return shared.LogError(logger, fmt.Errorf("received Postgres WAL error: %+v", errMsg)) - } - - msg, ok := rawMsg.(*pgproto3.CopyData) - if !ok { - continue - } - - if fetchedBytesCounter != nil { - fetchedBytesCounter.Add(ctx, int64(len(msg.Data)), metric.WithAttributeSet(attribute.NewSet( - attribute.String(otel_metrics.FlowNameKey, req.FlowJobName), - ))) - } - - switch msg.Data[0] { - case pglogrepl.PrimaryKeepaliveMessageByteID: - pkm, err := pglogrepl.ParsePrimaryKeepaliveMessage(msg.Data[1:]) - if err != nil { - return fmt.Errorf("ParsePrimaryKeepaliveMessage failed: %w", err) + switch msg := rawMsg.(type) { + case *pgproto3.ErrorResponse: + return shared.LogError(logger, fmt.Errorf("received Postgres WAL error: %+v", msg)) + case *pgproto3.CopyData: + if fetchedBytesCounter != nil { + fetchedBytesCounter.Add(ctx, int64(len(msg.Data)), metric.WithAttributeSet(attribute.NewSet( + attribute.String(otel_metrics.FlowNameKey, req.FlowJobName), + ))) } - 
logger.Debug("Primary Keepalive Message", slog.Bool("replyRequested", pkm.ReplyRequested), - slog.String("ServerWALEnd", pkm.ServerWALEnd.String()), slog.String("ServerTime", pkm.ServerTime.String())) - - if pkm.ServerWALEnd > clientXLogPos { - clientXLogPos = pkm.ServerWALEnd - } + switch msg.Data[0] { + case pglogrepl.PrimaryKeepaliveMessageByteID: + pkm, err := pglogrepl.ParsePrimaryKeepaliveMessage(msg.Data[1:]) + if err != nil { + return fmt.Errorf("ParsePrimaryKeepaliveMessage failed: %w", err) + } - if pkm.ReplyRequested || (pkmEmptyBatchThrottleThresholdSeconds != -1 && - time.Since(lastEmptyBatchPkmSentTime) >= time.Duration(pkmEmptyBatchThrottleThresholdSeconds)*time.Second) { - pkmRequiresResponse = true - } + if pkm.ServerWALEnd > clientXLogPos { + clientXLogPos = pkm.ServerWALEnd + } - case pglogrepl.XLogDataByteID: - xld, err := pglogrepl.ParseXLogData(msg.Data[1:]) - if err != nil { - return fmt.Errorf("ParseXLogData failed: %w", err) - } + if pkm.ReplyRequested || (pkmEmptyBatchThrottleThresholdSeconds != -1 && + time.Since(lastEmptyBatchPkmSentTime) >= time.Duration(pkmEmptyBatchThrottleThresholdSeconds)*time.Second) { + pkmRequiresResponse = true + } - logger.Debug("XLogData", - slog.Any("WALStart", xld.WALStart), slog.Any("ServerWALEnd", xld.ServerWALEnd), slog.Any("ServerTime", xld.ServerTime)) - rec, err := processMessage(ctx, p, records, xld, clientXLogPos, processor) - if err != nil { - return fmt.Errorf("error processing message: %w", err) - } + case pglogrepl.XLogDataByteID: + xld, err := pglogrepl.ParseXLogData(msg.Data[1:]) + if err != nil { + return fmt.Errorf("ParseXLogData failed: %w", err) + } - if xld.WALStart > clientXLogPos { - clientXLogPos = xld.WALStart - } + logger.Debug("XLogData", + slog.Any("WALStart", xld.WALStart), slog.Any("ServerWALEnd", xld.ServerWALEnd), slog.Any("ServerTime", xld.ServerTime)) + rec, err := processMessage(ctx, p, records, xld, clientXLogPos, processor) + if err != nil { + return fmt.Errorf("error processing message: %w", err) + } - if rec != nil { - tableName := rec.GetDestinationTableName() - switch r := rec.(type) { - case *model.UpdateRecord[Items]: - // tableName here is destination tableName. - // should be ideally sourceTableName as we are in PullRecords. - // will change in future - // TODO: replident is cached here, should not cache since it can change - isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull - if isFullReplica { - if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { - return err - } - } else { - tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) - if err != nil { - return err - } + if xld.WALStart > clientXLogPos { + clientXLogPos = xld.WALStart + } - latestRecord, ok, err := cdcRecordsStorage.Get(tablePkeyVal) - if err != nil { - return err - } - if ok { - // iterate through unchanged toast cols and set them in new record - updatedCols := r.NewItems.UpdateIfNotExists(latestRecord.GetItems()) - for _, col := range updatedCols { - delete(r.UnchangedToastColumns, col) + if rec != nil { + tableName := rec.GetDestinationTableName() + switch r := rec.(type) { + case *model.UpdateRecord[Items]: + // tableName here is destination tableName. + // should be ideally sourceTableName as we are in PullRecords. 
+ // will change in future + // TODO: replident is cached here, should not cache since it can change + isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull + if isFullReplica { + if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { + return err + } + } else { + tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) + if err != nil { + return err } - } - if err := addRecordWithKey(tablePkeyVal, rec); err != nil { - return err - } - } - case *model.InsertRecord[Items]: - isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull - if isFullReplica { - if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { - return err - } - } else { - tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) - if err != nil { - return err + latestRecord, ok, err := cdcRecordsStorage.Get(tablePkeyVal) + if err != nil { + return err + } + if ok { + // iterate through unchanged toast cols and set them in new record + updatedCols := r.NewItems.UpdateIfNotExists(latestRecord.GetItems()) + for _, col := range updatedCols { + delete(r.UnchangedToastColumns, col) + } + } + if err := addRecordWithKey(tablePkeyVal, rec); err != nil { + return err + } } - if err := addRecordWithKey(tablePkeyVal, rec); err != nil { - return err - } - } - case *model.DeleteRecord[Items]: - isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull - if isFullReplica { - if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { - return err - } - } else { - tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) - if err != nil { - return err - } + case *model.InsertRecord[Items]: + isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull + if isFullReplica { + if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { + return err + } + } else { + tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) + if err != nil { + return err + } - latestRecord, ok, err := cdcRecordsStorage.Get(tablePkeyVal) - if err != nil { - return err + if err := addRecordWithKey(tablePkeyVal, rec); err != nil { + return err + } } - if ok { - r.Items = latestRecord.GetItems() - if updateRecord, ok := latestRecord.(*model.UpdateRecord[Items]); ok { - r.UnchangedToastColumns = updateRecord.UnchangedToastColumns + case *model.DeleteRecord[Items]: + isFullReplica := req.TableNameSchemaMapping[tableName].IsReplicaIdentityFull + if isFullReplica { + if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { + return err } } else { - // there is nothing to backfill the items in the delete record with, - // so don't update the row with this record - // add sentinel value to prevent update statements from selecting - r.UnchangedToastColumns = map[string]struct{}{ - "_peerdb_not_backfilled_delete": {}, + tablePkeyVal, err := model.RecToTablePKey(req.TableNameSchemaMapping, rec) + if err != nil { + return err } - } - // A delete can only be followed by an INSERT, which does not need backfilling - // No need to store DeleteRecords in memory or disk. 
- if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { - return err + latestRecord, ok, err := cdcRecordsStorage.Get(tablePkeyVal) + if err != nil { + return err + } + if ok { + r.Items = latestRecord.GetItems() + if updateRecord, ok := latestRecord.(*model.UpdateRecord[Items]); ok { + r.UnchangedToastColumns = updateRecord.UnchangedToastColumns + } + } else { + // there is nothing to backfill the items in the delete record with, + // so don't update the row with this record + // add sentinel value to prevent update statements from selecting + r.UnchangedToastColumns = map[string]struct{}{ + "_peerdb_not_backfilled_delete": {}, + } + } + + // A delete can only be followed by an INSERT, which does not need backfilling + // No need to store DeleteRecords in memory or disk. + if err := addRecordWithKey(model.TableWithPkey{}, rec); err != nil { + return err + } } - } - case *model.RelationRecord[Items]: - tableSchemaDelta := r.TableSchemaDelta - if len(tableSchemaDelta.AddedColumns) > 0 { - logger.Info(fmt.Sprintf("Detected schema change for table %s, addedColumns: %v", - tableSchemaDelta.SrcTableName, tableSchemaDelta.AddedColumns)) - records.AddSchemaDelta(req.TableNameMapping, tableSchemaDelta) - } + case *model.RelationRecord[Items]: + tableSchemaDelta := r.TableSchemaDelta + if len(tableSchemaDelta.AddedColumns) > 0 { + logger.Info(fmt.Sprintf("Detected schema change for table %s, addedColumns: %v", + tableSchemaDelta.SrcTableName, tableSchemaDelta.AddedColumns)) + records.AddSchemaDelta(req.TableNameMapping, tableSchemaDelta) + } - case *model.MessageRecord[Items]: - // if cdc store empty, we can move lsn, - // otherwise push to records so destination can ack once all previous messages processed - if cdcRecordsStorage.IsEmpty() { - if int64(clientXLogPos) > req.ConsumedOffset.Load() { - metadata := connmetadata.NewPostgresMetadataFromCatalog(logger, p.catalogPool) - if err := metadata.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}); err != nil { - return err + case *model.MessageRecord[Items]: + // if cdc store empty, we can move lsn, + // otherwise push to records so destination can ack once all previous messages processed + if cdcRecordsStorage.IsEmpty() { + if int64(clientXLogPos) > req.ConsumedOffset.Load() { + metadata := connmetadata.NewPostgresMetadataFromCatalog(logger, p.catalogPool) + if err := metadata.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}); err != nil { + return err + } + req.ConsumedOffset.Store(int64(clientXLogPos)) } - req.ConsumedOffset.Store(int64(clientXLogPos)) + } else if err := records.AddRecord(ctx, rec); err != nil { + return err } - } else if err := records.AddRecord(ctx, rec); err != nil { - return err } } } From b9d891df9195cf3345533149f3a45724d4a9d858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 26 Dec 2024 05:09:15 +0000 Subject: [PATCH 09/80] fix lints --- flow/connectors/mysql/cdc.go | 2 +- flow/connectors/mysql/mysql.go | 1 - flow/connectors/postgres/cdc.go | 7 ++++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index be941d88a0..2922a2e7a1 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -253,7 +253,7 @@ func (c *MySqlConnector) PullRecords( } } - if recordCount >= 0 && recordCount >= req.MaxBatchSize { + if recordCount >= req.MaxBatchSize { return nil } } diff --git a/flow/connectors/mysql/mysql.go 
b/flow/connectors/mysql/mysql.go index a854814535..44c70efc57 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -147,5 +147,4 @@ func (c *MySqlConnector) GetVersion(ctx context.Context) (string, error) { version, _ := rr.GetString(0, 0) c.logger.Info("[mysql] version", slog.String("version", version)) return version, nil - } diff --git a/flow/connectors/postgres/cdc.go b/flow/connectors/postgres/cdc.go index 70d0a0d82c..eed73f677a 100644 --- a/flow/connectors/postgres/cdc.go +++ b/flow/connectors/postgres/cdc.go @@ -413,8 +413,7 @@ func PullCdcRecords[Items model.Items]( } if p.commitLock == nil { - cdclen := cdcRecordsStorage.Len() - if cdclen >= 0 && uint32(cdclen) >= req.MaxBatchSize { + if cdcRecordsStorage.Len() >= int(req.MaxBatchSize) { return nil } @@ -624,7 +623,9 @@ func PullCdcRecords[Items model.Items]( if cdcRecordsStorage.IsEmpty() { if int64(clientXLogPos) > req.ConsumedOffset.Load() { metadata := connmetadata.NewPostgresMetadataFromCatalog(logger, p.catalogPool) - if err := metadata.SetLastOffset(ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}); err != nil { + if err := metadata.SetLastOffset( + ctx, req.FlowJobName, model.CdcCheckpoint{ID: int64(clientXLogPos)}, + ); err != nil { return err } req.ConsumedOffset.Store(int64(clientXLogPos)) From fbe1766e41d9192752ffeb898356146988d657f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 26 Dec 2024 05:20:40 +0000 Subject: [PATCH 10/80] fix nexus --- nexus/analyzer/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/analyzer/src/lib.rs b/nexus/analyzer/src/lib.rs index 4d9133ffe5..71064e9367 100644 --- a/nexus/analyzer/src/lib.rs +++ b/nexus/analyzer/src/lib.rs @@ -979,6 +979,7 @@ fn parse_db_options(db_type: DbType, with_options: &[SqlOption]) -> anyhow::Resu .get("disable_tls") .and_then(|s| s.parse::().ok()) .unwrap_or_default(), + flavor: opts.get("flavor").unwrap_or(&"mysql").to_string(), }), })) } From 9af8c58dc468900e11f782fa4f8bb1e47b4f45ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 26 Dec 2024 15:38:29 +0000 Subject: [PATCH 11/80] mysql: GetTableSchemaConnector --- flow/connectors/core.go | 1 + flow/connectors/mysql/cdc.go | 129 ++++++++++++++++++++++++++++++++++- flow/workflows/cdc_flow.go | 3 +- 3 files changed, 130 insertions(+), 3 deletions(-) diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 9450667ac1..3fbb620a32 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -472,6 +472,7 @@ var ( _ CDCNormalizeConnector = &connclickhouse.ClickHouseConnector{} _ GetTableSchemaConnector = &connpostgres.PostgresConnector{} + _ GetTableSchemaConnector = &connmysql.MySqlConnector{} _ GetTableSchemaConnector = &connsnowflake.SnowflakeConnector{} _ GetTableSchemaConnector = &connclickhouse.ClickHouseConnector{} diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 2922a2e7a1..655810c43d 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "log/slog" "time" "github.com/go-mysql-org/go-mysql/mysql" @@ -14,10 +15,12 @@ import ( "go.opentelemetry.io/otel/metric" "github.com/PeerDB-io/peerdb/flow/alerting" + "github.com/PeerDB-io/peerdb/flow/connectors/utils" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/model" "github.com/PeerDB-io/peerdb/flow/model/qvalue" "github.com/PeerDB-io/peerdb/flow/otel_metrics" + 
"github.com/PeerDB-io/peerdb/flow/peerdbenv" ) func (c *MySqlConnector) GetTableSchema( @@ -26,7 +29,131 @@ func (c *MySqlConnector) GetTableSchema( system protos.TypeSystem, tableIdentifiers []string, ) (map[string]*protos.TableSchema, error) { - panic("TODO") + res := make(map[string]*protos.TableSchema, len(tableIdentifiers)) + for _, tableName := range tableIdentifiers { + tableSchema, err := c.getTableSchemaForTable(ctx, env, tableName, system) + if err != nil { + c.logger.Info("error fetching schema for table "+tableName, slog.Any("error", err)) + return nil, err + } + res[tableName] = tableSchema + c.logger.Info("fetched schema for table " + tableName) + } + + return res, nil +} + +func (c *MySqlConnector) getTableSchemaForTable( + ctx context.Context, + env map[string]string, + tableName string, + system protos.TypeSystem, +) (*protos.TableSchema, error) { + schemaTable, err := utils.ParseSchemaTable(tableName) + if err != nil { + return nil, err + } + + nullableEnabled, err := peerdbenv.PeerDBNullable(ctx, env) + if err != nil { + return nil, err + } + + rs, err := c.Execute(ctx, fmt.Sprintf("select * from %s limit 0", schemaTable.String())) + if err != nil { + return nil, err + } + columns := make([]*protos.FieldDescription, 0, len(rs.Values)) + primary := make([]string, 0) + for _, field := range rs.Fields { + var qkind qvalue.QValueKind + switch field.Type { + case mysql.MYSQL_TYPE_DECIMAL: + qkind = qvalue.QValueKindNumeric + case mysql.MYSQL_TYPE_TINY: + qkind = qvalue.QValueKindInt16 // TODO qvalue.QValueKindInt8 + case mysql.MYSQL_TYPE_SHORT: + qkind = qvalue.QValueKindInt16 + case mysql.MYSQL_TYPE_LONG: + qkind = qvalue.QValueKindInt32 + case mysql.MYSQL_TYPE_FLOAT: + qkind = qvalue.QValueKindFloat32 + case mysql.MYSQL_TYPE_DOUBLE: + qkind = qvalue.QValueKindFloat64 + case mysql.MYSQL_TYPE_NULL: + qkind = qvalue.QValueKindInvalid // TODO qvalue.QValueKindNothing + case mysql.MYSQL_TYPE_TIMESTAMP: + qkind = qvalue.QValueKindTimestamp + case mysql.MYSQL_TYPE_LONGLONG: + qkind = qvalue.QValueKindInt64 + case mysql.MYSQL_TYPE_INT24: + qkind = qvalue.QValueKindInt32 + case mysql.MYSQL_TYPE_DATE: + qkind = qvalue.QValueKindDate + case mysql.MYSQL_TYPE_TIME: + qkind = qvalue.QValueKindTime + case mysql.MYSQL_TYPE_DATETIME: + qkind = qvalue.QValueKindTimestamp + case mysql.MYSQL_TYPE_YEAR: + qkind = qvalue.QValueKindInt16 + case mysql.MYSQL_TYPE_NEWDATE: + qkind = qvalue.QValueKindDate + case mysql.MYSQL_TYPE_VARCHAR: + qkind = qvalue.QValueKindString + case mysql.MYSQL_TYPE_BIT: + qkind = qvalue.QValueKindInt64 + case mysql.MYSQL_TYPE_TIMESTAMP2: + qkind = qvalue.QValueKindTimestamp + case mysql.MYSQL_TYPE_DATETIME2: + qkind = qvalue.QValueKindTimestamp + case mysql.MYSQL_TYPE_TIME2: + qkind = qvalue.QValueKindTime + case mysql.MYSQL_TYPE_JSON: + qkind = qvalue.QValueKindJSON + case mysql.MYSQL_TYPE_NEWDECIMAL: + qkind = qvalue.QValueKindNumeric + case mysql.MYSQL_TYPE_ENUM: + qkind = qvalue.QValueKindInt64 + case mysql.MYSQL_TYPE_SET: + qkind = qvalue.QValueKindInt64 + case mysql.MYSQL_TYPE_TINY_BLOB: + qkind = qvalue.QValueKindBytes + case mysql.MYSQL_TYPE_MEDIUM_BLOB: + qkind = qvalue.QValueKindBytes + case mysql.MYSQL_TYPE_LONG_BLOB: + qkind = qvalue.QValueKindBytes + case mysql.MYSQL_TYPE_BLOB: + qkind = qvalue.QValueKindBytes + case mysql.MYSQL_TYPE_VAR_STRING: + qkind = qvalue.QValueKindString + case mysql.MYSQL_TYPE_STRING: + qkind = qvalue.QValueKindString + case mysql.MYSQL_TYPE_GEOMETRY: + qkind = qvalue.QValueKindGeometry + default: + return nil, fmt.Errorf("unknown 
mysql type %d", field.Type) + } + column := &protos.FieldDescription{ + Name: string(field.Name), + Type: string(qkind), + TypeModifier: 0, // TODO numeric precision info + Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, + } + if (field.Flag & mysql.PRI_KEY_FLAG) != 0 { + primary = append(primary, column.Name) + + } + columns = append(columns, column) + } + + return &protos.TableSchema{ + TableIdentifier: tableName, + PrimaryKeyColumns: primary, + IsReplicaIdentityFull: false, + System: system, + NullableEnabled: nullableEnabled, + Columns: columns, + }, nil } func (c *MySqlConnector) EnsurePullability( diff --git a/flow/workflows/cdc_flow.go b/flow/workflows/cdc_flow.go index 497e2d4bb8..800ffde158 100644 --- a/flow/workflows/cdc_flow.go +++ b/flow/workflows/cdc_flow.go @@ -58,8 +58,7 @@ func GetSideEffect[T any](ctx workflow.Context, f func(workflow.Context) T) T { }) var result T - err := sideEffect.Get(&result) - if err != nil { + if err := sideEffect.Get(&result); err != nil { panic(err) } return result From 618d3f179b478b4bf5f57af7bbc7e86b5d0b0e3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 26 Dec 2024 20:25:32 +0000 Subject: [PATCH 12/80] GTIDSet juggling --- flow/activities/flowable.go | 29 +++++++++--------- flow/activities/flowable_core.go | 8 +++-- flow/connectors/core.go | 2 +- flow/connectors/mysql/cdc.go | 45 ++++++++++++++++++++++++---- flow/connectors/mysql/mysql.go | 13 ++++---- flow/connectors/postgres/postgres.go | 3 +- 6 files changed, 69 insertions(+), 31 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index 7c0058a5b5..b7aeb1f02a 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -468,15 +468,14 @@ func (a *FlowableActivity) GetQRepPartitions(ctx context.Context, return nil, fmt.Errorf("failed to get partitions from source: %w", err) } if len(partitions) > 0 { - err = monitoring.InitializeQRepRun( + if err := monitoring.InitializeQRepRun( ctx, a.CatalogPool, config, runUUID, partitions, config.ParentMirrorName, - ) - if err != nil { + ); err != nil { return nil, err } } @@ -951,15 +950,15 @@ func (a *FlowableActivity) AddTablesToPublication(ctx context.Context, cfg *prot } defer connectors.CloseConnector(ctx, srcConn) - err = srcConn.AddTablesToPublication(ctx, &protos.AddTablesToPublicationInput{ + if err := srcConn.AddTablesToPublication(ctx, &protos.AddTablesToPublicationInput{ FlowJobName: cfg.FlowJobName, PublicationName: cfg.PublicationName, AdditionalTables: additionalTableMappings, - }) - if err != nil { + }); err != nil { a.Alerter.LogFlowError(ctx, cfg.FlowJobName, err) + return err } - return err + return nil } func (a *FlowableActivity) RemoveTablesFromPublication( @@ -974,15 +973,15 @@ func (a *FlowableActivity) RemoveTablesFromPublication( } defer connectors.CloseConnector(ctx, srcConn) - err = srcConn.RemoveTablesFromPublication(ctx, &protos.RemoveTablesFromPublicationInput{ + if err := srcConn.RemoveTablesFromPublication(ctx, &protos.RemoveTablesFromPublicationInput{ FlowJobName: cfg.FlowJobName, PublicationName: cfg.PublicationName, TablesToRemove: removedTablesMapping, - }) - if err != nil { + }); err != nil { a.Alerter.LogFlowError(ctx, cfg.FlowJobName, err) + return err } - return err + return nil } func (a *FlowableActivity) RemoveTablesFromRawTable( @@ -1020,16 +1019,16 @@ func (a *FlowableActivity) RemoveTablesFromRawTable( for _, table := range tablesToRemove { tableNames = append(tableNames, table.DestinationTableIdentifier) } - err = 
dstConn.RemoveTableEntriesFromRawTable(ctx, &protos.RemoveTablesFromRawTableInput{ + if err := dstConn.RemoveTableEntriesFromRawTable(ctx, &protos.RemoveTablesFromRawTableInput{ FlowJobName: cfg.FlowJobName, DestinationTableNames: tableNames, SyncBatchId: syncBatchID, NormalizeBatchId: normBatchID, - }) - if err != nil { + }); err != nil { a.Alerter.LogFlowError(ctx, cfg.FlowJobName, err) + return err } - return err + return nil } func (a *FlowableActivity) RemoveTablesFromCatalog( diff --git a/flow/activities/flowable_core.go b/flow/activities/flowable_core.go index f759d4de1a..803c31fa96 100644 --- a/flow/activities/flowable_core.go +++ b/flow/activities/flowable_core.go @@ -224,7 +224,6 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon return nil, a.applySchemaDeltas(ctx, config, options, recordBatchSync.SchemaDeltas) } - var syncStartTime time.Time var res *model.SyncResponse errGroup.Go(func() error { dstConn, err := connectors.GetByNameAs[TSync](ctx, config.Env, a.CatalogPool, config.DestinationName) @@ -251,7 +250,6 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon return err } - syncStartTime = time.Now() res, err = sync(dstConn, errCtx, &model.SyncRecordsRequest[Items]{ SyncBatchID: syncBatchID, Records: recordBatchSync, @@ -271,6 +269,7 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon return nil }) + syncStartTime := time.Now() if err := errGroup.Wait(); err != nil { // don't log flow error for "replState changed" and "slot is already active" if !(temporal.IsApplicationError(err) || @@ -287,7 +286,10 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon syncDuration := time.Since(syncStartTime) lastCheckpoint := recordBatchSync.GetLastCheckpoint() - srcConn.UpdateReplStateLastOffset(lastCheckpoint) + if err := srcConn.UpdateReplStateLastOffset(ctx, lastCheckpoint); err != nil { + a.Alerter.LogFlowError(ctx, flowName, err) + return 0, err + } if err := monitoring.UpdateNumRowsAndEndLSNForCDCBatch( ctx, a.CatalogPool, flowName, res.CurrentSyncBatchID, uint32(res.NumRecordsSynced), lastCheckpoint, diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 3fbb620a32..a72b85e36f 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -74,7 +74,7 @@ type CDCPullConnectorCore interface { ReplPing(context.Context) error // Called when offset has been confirmed to destination - UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) + UpdateReplStateLastOffset(ctx context.Context, lastOffset model.CdcCheckpoint) error // PullFlowCleanup drops both the Postgres publication and replication slot, as a part of DROP MIRROR PullFlowCleanup(ctx context.Context, jobName string) error diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 655810c43d..db772799f7 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -21,6 +21,7 @@ import ( "github.com/PeerDB-io/peerdb/flow/model/qvalue" "github.com/PeerDB-io/peerdb/flow/otel_metrics" "github.com/PeerDB-io/peerdb/flow/peerdbenv" + "github.com/PeerDB-io/peerdb/flow/shared" ) func (c *MySqlConnector) GetTableSchema( @@ -141,7 +142,6 @@ func (c *MySqlConnector) getTableSchemaForTable( } if (field.Flag & mysql.PRI_KEY_FLAG) != 0 { primary = append(primary, column.Name) - } columns = append(columns, column) } @@ -171,8 +171,24 @@ func (c *MySqlConnector) FinishExport(any) error { return nil } -func (c *MySqlConnector) SetupReplConn(context.Context) 
error { +func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { // mysql code will spin up new connection for each normalize for now + flowName := ctx.Value(shared.FlowNameKey).(string) + offset, err := c.GetLastOffset(ctx, flowName) + if err != nil { + return fmt.Errorf("[mysql] SetupReplConn failed to GetLastOffset: %w", err) + } + if offset.Text == "" { + set, err := c.GetMasterGTIDSet(ctx) + if err != nil { + return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterGTIDSet: %w", err) + } + if err := c.SetLastOffset( + ctx, flowName, model.CdcCheckpoint{Text: set.String()}, + ); err != nil { + return fmt.Errorf("[mysql] SetupReplConn failed to SetLastOffset: %w", err) + } + } return nil } @@ -190,7 +206,10 @@ func (c *MySqlConnector) ReplPing(context.Context) error { return nil } -func (c *MySqlConnector) UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) { +func (c *MySqlConnector) UpdateReplStateLastOffset(ctx context.Context, lastOffset model.CdcCheckpoint) error { + // TODO assert c.replState == lastOffset + flowName := ctx.Value(shared.FlowNameKey).(string) + return c.SetLastOffset(ctx, flowName, lastOffset) } func (c *MySqlConnector) PullFlowCleanup(ctx context.Context, jobName string) error { @@ -284,7 +303,6 @@ func (c *MySqlConnector) PullRecords( ) error { defer func() { req.RecordStream.Close() - // update replState Offset }() gset, err := mysql.ParseGTIDSet(c.config.Flavor, req.LastOffset.Text) if err != nil { @@ -319,7 +337,24 @@ func (c *MySqlConnector) PullRecords( ))) } - if ev, ok := event.Event.(*replication.RowsEvent); ok { + switch ev := event.Event.(type) { + case *replication.MariadbGTIDEvent: + var err error + newset, err := ev.GTIDNext() + if err != nil { + // TODO could ignore, but then we might get stuck rereading same batch each time + return err + } + c.replState = newset + case *replication.GTIDEvent: + var err error + newset, err := ev.GTIDNext() + if err != nil { + // TODO could ignore, but then we might get stuck rereading same batch each time + return err + } + c.replState = newset + case *replication.RowsEvent: sourceTableName := string(ev.Table.Table) // TODO need ev.Table.Schema? destinationTableName := req.TableNameMapping[sourceTableName].Name schema := req.TableNameSchemaMapping[destinationTableName] diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 44c70efc57..34ce96af75 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -21,10 +21,11 @@ import ( type MySqlConnector struct { *metadataStore.PostgresMetadata - config *protos.MySqlConfig - conn *client.Conn - syncer *replication.BinlogSyncer - logger log.Logger + config *protos.MySqlConfig + conn *client.Conn + syncer *replication.BinlogSyncer + logger log.Logger + replState mysql.GTIDSet } func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { @@ -85,7 +86,7 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf } } - rr, err := c.conn.Execute(cmd, args...) + rs, err := c.conn.Execute(cmd, args...) 
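+	// a bad connection (mysql.ErrBadConn) is retried below, bounded by the reconnects counter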
if err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 @@ -95,7 +96,7 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf } return nil, err } - return rr, nil + return rs, nil } } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 8c7505ed92..2c97c9d75e 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -448,10 +448,11 @@ func pullCore[Items model.Items]( return nil } -func (c *PostgresConnector) UpdateReplStateLastOffset(lastOffset model.CdcCheckpoint) { +func (c *PostgresConnector) UpdateReplStateLastOffset(_ context.Context, lastOffset model.CdcCheckpoint) error { if c.replState != nil { c.replState.LastOffset.Store(lastOffset.ID) } + return nil } func (c *PostgresConnector) SyncRecords(ctx context.Context, req *model.SyncRecordsRequest[model.RecordItems]) (*model.SyncResponse, error) { From d81659901ffdabaff86287000801646b5dbc2365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 27 Dec 2024 18:59:12 +0000 Subject: [PATCH 13/80] skip validation if source mysql, don't require CDCSyncConnector implemented on source peer --- flow/activities/flowable.go | 34 +++++++++++++++++----- flow/cmd/validate_mirror.go | 5 ++++ flow/connectors/core.go | 2 +- flow/connectors/eventhub/eventhub.go | 8 ----- flow/connectors/external_metadata/store.go | 4 +-- flow/connectors/kafka/kafka.go | 8 ----- flow/connectors/postgres/client.go | 5 ++-- flow/connectors/postgres/postgres.go | 7 ++--- flow/workflows/setup_flow.go | 15 ++++------ 9 files changed, 45 insertions(+), 43 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index b7aeb1f02a..661d326667 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -32,8 +32,7 @@ import ( "github.com/PeerDB-io/peerdb/flow/shared" ) -// CheckConnectionResult is the result of a CheckConnection call. 
-type CheckConnectionResult struct { +type CheckMetadataTablesResult struct { NeedsSetupMetadataTables bool } @@ -55,18 +54,39 @@ type StreamCloser interface { func (a *FlowableActivity) CheckConnection( ctx context.Context, config *protos.SetupInput, -) (*CheckConnectionResult, error) { +) error { ctx = context.WithValue(ctx, shared.FlowNameKey, config.FlowName) - dstConn, err := connectors.GetByNameAs[connectors.CDCSyncConnector](ctx, config.Env, a.CatalogPool, config.PeerName) + conn, err := connectors.GetByNameAs[connectors.CDCSyncConnector](ctx, config.Env, a.CatalogPool, config.PeerName) + if err != nil { + if errors.Is(err, errors.ErrUnsupported) { + return nil + } + a.Alerter.LogFlowError(ctx, config.FlowName, err) + return fmt.Errorf("failed to get connector: %w", err) + } + defer connectors.CloseConnector(ctx, conn) + + return conn.ConnectionActive(ctx) +} + +func (a *FlowableActivity) CheckMetadataTables( + ctx context.Context, + config *protos.SetupInput, +) (*CheckMetadataTablesResult, error) { + ctx = context.WithValue(ctx, shared.FlowNameKey, config.FlowName) + conn, err := connectors.GetByNameAs[connectors.CDCSyncConnector](ctx, config.Env, a.CatalogPool, config.PeerName) if err != nil { a.Alerter.LogFlowError(ctx, config.FlowName, err) return nil, fmt.Errorf("failed to get connector: %w", err) } - defer connectors.CloseConnector(ctx, dstConn) + defer connectors.CloseConnector(ctx, conn) - needsSetup := dstConn.NeedsSetupMetadataTables(ctx) + needsSetup, err := conn.NeedsSetupMetadataTables(ctx) + if err != nil { + return nil, err + } - return &CheckConnectionResult{ + return &CheckMetadataTablesResult{ NeedsSetupMetadataTables: needsSetup, }, nil } diff --git a/flow/cmd/validate_mirror.go b/flow/cmd/validate_mirror.go index fe94e9ff94..3852b576a9 100644 --- a/flow/cmd/validate_mirror.go +++ b/flow/cmd/validate_mirror.go @@ -63,6 +63,11 @@ func (h *FlowRequestHandler) ValidateCDCMirror( sourcePeerConfig := sourcePeer.GetPostgresConfig() if sourcePeerConfig == nil { + if sourcePeer.GetMysqlConfig() != nil { + // TODO mysql validation + // eg disable json diff, only row based replication supported, ... + return &protos.ValidateCDCMirrorResponse{}, nil + } slog.Error("/validatecdc source peer config is not postgres", slog.String("peer", req.ConnectionConfigs.SourceName)) return nil, errors.New("source peer config is not postgres") } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index a72b85e36f..c2954ac404 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -150,7 +150,7 @@ type CDCSyncConnectorCore interface { Connector // NeedsSetupMetadataTables checks if the metadata table [PEERDB_MIRROR_JOBS] needs to be created. - NeedsSetupMetadataTables(ctx context.Context) bool + NeedsSetupMetadataTables(ctx context.Context) (bool, error) // SetupMetadataTables creates the metadata table [PEERDB_MIRROR_JOBS] if necessary. 
SetupMetadataTables(ctx context.Context) error diff --git a/flow/connectors/eventhub/eventhub.go b/flow/connectors/eventhub/eventhub.go index 788df4f754..16a6c5a770 100644 --- a/flow/connectors/eventhub/eventhub.go +++ b/flow/connectors/eventhub/eventhub.go @@ -75,14 +75,6 @@ func (c *EventHubConnector) ConnectionActive(ctx context.Context) error { return nil } -func (c *EventHubConnector) NeedsSetupMetadataTables(_ context.Context) bool { - return false -} - -func (c *EventHubConnector) SetupMetadataTables(_ context.Context) error { - return nil -} - func lvalueToEventData(ls *lua.LState, value lua.LValue) (ScopedEventhubData, error) { var scoped ScopedEventhubData switch v := value.(type) { diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 7df5267588..77a02792b8 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -64,8 +64,8 @@ func (p *PostgresMetadata) LogFlowInfo(ctx context.Context, flowName string, inf return err } -func (p *PostgresMetadata) NeedsSetupMetadataTables(_ context.Context) bool { - return false +func (p *PostgresMetadata) NeedsSetupMetadataTables(_ context.Context) (bool, error) { + return false, nil } func (p *PostgresMetadata) SetupMetadataTables(_ context.Context) error { diff --git a/flow/connectors/kafka/kafka.go b/flow/connectors/kafka/kafka.go index 8f94b9d1db..0681def8bb 100644 --- a/flow/connectors/kafka/kafka.go +++ b/flow/connectors/kafka/kafka.go @@ -141,14 +141,6 @@ func (c *KafkaConnector) CreateRawTable(ctx context.Context, req *protos.CreateR return &protos.CreateRawTableOutput{TableIdentifier: "n/a"}, nil } -func (c *KafkaConnector) NeedsSetupMetadataTables(_ context.Context) bool { - return false -} - -func (c *KafkaConnector) SetupMetadataTables(_ context.Context) error { - return nil -} - func (c *KafkaConnector) ReplayTableSchemaDeltas(_ context.Context, _ map[string]string, flowJobName string, schemaDeltas []*protos.TableSchemaDelta, ) error { diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index a65fc588c1..1dd2145680 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -216,7 +216,7 @@ func (c *PostgresConnector) getNullableColumns(ctx context.Context, relID uint32 func (c *PostgresConnector) tableExists(ctx context.Context, schemaTable *utils.SchemaTable) (bool, error) { var exists pgtype.Bool - err := c.conn.QueryRow(ctx, + if err := c.conn.QueryRow(ctx, `SELECT EXISTS ( SELECT FROM pg_tables WHERE schemaname = $1 @@ -224,8 +224,7 @@ func (c *PostgresConnector) tableExists(ctx context.Context, schemaTable *utils. )`, schemaTable.Schema, schemaTable.Table, - ).Scan(&exists) - if err != nil { + ).Scan(&exists); err != nil { return false, fmt.Errorf("error checking if table exists: %w", err) } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 2c97c9d75e..24cfcff9f9 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -274,15 +274,12 @@ func (c *PostgresConnector) ConnectionActive(ctx context.Context) error { } // NeedsSetupMetadataTables returns true if the metadata tables need to be set up. 
-func (c *PostgresConnector) NeedsSetupMetadataTables(ctx context.Context) bool { +func (c *PostgresConnector) NeedsSetupMetadataTables(ctx context.Context) (bool, error) { result, err := c.tableExists(ctx, &utils.SchemaTable{ Schema: c.metadataSchema, Table: mirrorJobsTableIdentifier, }) - if err != nil { - return true - } - return !result + return !result, err } // SetupMetadataTables sets up the metadata tables. diff --git a/flow/workflows/setup_flow.go b/flow/workflows/setup_flow.go index 26ce9bafa9..752ce7ec77 100644 --- a/flow/workflows/setup_flow.go +++ b/flow/workflows/setup_flow.go @@ -66,20 +66,18 @@ func (s *SetupFlowExecution) checkConnectionsAndSetupMetadataTables( PeerName: config.SourceName, FlowName: config.FlowJobName, }) - var srcConnStatus activities.CheckConnectionResult - if err := srcConnStatusFuture.Get(checkCtx, &srcConnStatus); err != nil { - return fmt.Errorf("failed to check source peer connection: %w", err) - } - dstSetupInput := &protos.SetupInput{ Env: config.Env, PeerName: config.DestinationName, FlowName: config.FlowJobName, } + destConnStatusFuture := workflow.ExecuteLocalActivity(checkCtx, flowable.CheckMetadataTables, dstSetupInput) + if err := srcConnStatusFuture.Get(checkCtx, nil); err != nil { + return fmt.Errorf("failed to check source peer connection: %w", err) + } // then check the destination peer connection - destConnStatusFuture := workflow.ExecuteLocalActivity(checkCtx, flowable.CheckConnection, dstSetupInput) - var destConnStatus activities.CheckConnectionResult + var destConnStatus activities.CheckMetadataTablesResult if err := destConnStatusFuture.Get(checkCtx, &destConnStatus); err != nil { return fmt.Errorf("failed to check destination peer connection: %w", err) } @@ -94,8 +92,7 @@ func (s *SetupFlowExecution) checkConnectionsAndSetupMetadataTables( InitialInterval: 1 * time.Minute, }, }) - fDst := workflow.ExecuteActivity(setupCtx, flowable.SetupMetadataTables, dstSetupInput) - if err := fDst.Get(setupCtx, nil); err != nil { + if err := workflow.ExecuteActivity(setupCtx, flowable.SetupMetadataTables, dstSetupInput).Get(setupCtx, nil); err != nil { return fmt.Errorf("failed to setup destination peer metadata tables: %w", err) } } else { From 25c60e7f2bc10ad58c78c0507ccacb593c8b2ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 27 Dec 2024 19:06:38 +0000 Subject: [PATCH 14/80] turns out MySQL defaults to ANSI_QUOTES being disabled, so use backticks which always works --- flow/connectors/mysql/cdc.go | 2 +- flow/connectors/utils/identifiers.go | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index db772799f7..f88359a8e8 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -60,7 +60,7 @@ func (c *MySqlConnector) getTableSchemaForTable( return nil, err } - rs, err := c.Execute(ctx, fmt.Sprintf("select * from %s limit 0", schemaTable.String())) + rs, err := c.Execute(ctx, fmt.Sprintf("select * from %s limit 0", schemaTable.MySQL())) if err != nil { return nil, err } diff --git a/flow/connectors/utils/identifiers.go b/flow/connectors/utils/identifiers.go index 19867971a9..2066d49a65 100644 --- a/flow/connectors/utils/identifiers.go +++ b/flow/connectors/utils/identifiers.go @@ -20,6 +20,10 @@ func (t *SchemaTable) String() string { return fmt.Sprintf(`"%s"."%s"`, t.Schema, t.Table) } +func (t *SchemaTable) MySQL() string { + return fmt.Sprintf("`%s`.`%s`", t.Schema, t.Table) +} + // ParseSchemaTable parses a table 
name into schema and table name. func ParseSchemaTable(tableName string) (*SchemaTable, error) { schema, table, hasDot := strings.Cut(tableName, ".") From 9ba7f66c6667ffdea301d9e669823da4c88daa13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 27 Dec 2024 19:13:22 +0000 Subject: [PATCH 15/80] fix nil deref --- flow/connectors/mysql/mysql.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 34ce96af75..ca92989e0c 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -29,6 +29,10 @@ type MySqlConnector struct { } func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { + pgMetadata, err := metadataStore.NewPostgresMetadata(ctx) + if err != nil { + return nil, err + } syncer := replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ ServerID: 1729, // TODO put in config (or generate randomly, which is what go-mysql-org does) Flavor: config.Flavor, @@ -40,9 +44,10 @@ func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlC ParseTime: true, }) return &MySqlConnector{ - config: config, - syncer: syncer, - logger: shared.LoggerFromCtx(ctx), + PostgresMetadata: pgMetadata, + config: config, + syncer: syncer, + logger: shared.LoggerFromCtx(ctx), }, nil } From 3186f3a5fd5a52d968d1650dad0c18b7814a1620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 27 Dec 2024 20:53:12 +0000 Subject: [PATCH 16/80] fixes --- flow/connectors/mysql/cdc.go | 22 ++++++++++--------- flow/connectors/postgres/client.go | 4 ++-- .../migrations/V42__mysql_metadata.sql | 1 + 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index f88359a8e8..08657643db 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -301,9 +301,7 @@ func (c *MySqlConnector) PullRecords( otelManager *otel_metrics.OtelManager, req *model.PullRecordsRequest[model.RecordItems], ) error { - defer func() { - req.RecordStream.Close() - }() + defer req.RecordStream.Close() gset, err := mysql.ParseGTIDSet(c.config.Flavor, req.LastOffset.Text) if err != nil { return err @@ -323,11 +321,16 @@ func (c *MySqlConnector) PullRecords( } } + timeoutCtx, cancelTimeout := context.WithTimeout(ctx, req.IdleTimeout) + defer cancelTimeout() + var recordCount uint32 for { - // TODO put req.IdleTimeout timer on this - event, err := mystream.GetEvent(ctx) + event, err := mystream.GetEvent(timeoutCtx) if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return nil + } return err } @@ -355,13 +358,12 @@ func (c *MySqlConnector) PullRecords( } c.replState = newset case *replication.RowsEvent: - sourceTableName := string(ev.Table.Table) // TODO need ev.Table.Schema? + sourceTableName := string(ev.Table.Schema) + "." 
+ string(ev.Table.Table) // TODO this is fragile destinationTableName := req.TableNameMapping[sourceTableName].Name schema := req.TableNameSchemaMapping[destinationTableName] for _, row := range ev.Rows { var record model.Record[model.RecordItems] - // TODO need mapping of column index to column name - var items model.RecordItems + items := model.NewRecordItems(len(row)) switch event.Header.EventType { case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: return errors.New("mysql v0 replication protocol not supported") @@ -377,10 +379,10 @@ func (c *MySqlConnector) PullRecords( DestinationTableName: destinationTableName, } case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: - var oldItems model.RecordItems + oldItems := model.NewRecordItems(len(row) / 2) for idx, val := range row { fd := schema.Columns[idx>>1] - qv := qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + qv := qvalueFromMysql(ev.Table.ColumnType[idx>>1], qvalue.QValueKind(fd.Type), val) if (idx & 1) == 0 { // TODO test that it isn't other way around oldItems.AddColumn(fd.Name, qv) } else { diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index 1dd2145680..89505e398f 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -41,9 +41,9 @@ const ( getLastNormalizeBatchID_SQL = "SELECT normalize_batch_id FROM %s.%s WHERE mirror_job_name=$1" createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)" checkTableExistsSQL = "SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_tables WHERE schemaname = $1 AND tablename = $2)" - upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s (mirror_job_name, lsn_offset, lsn_text, sync_batch_id) AS j VALUES ($1,$2,$3,$4) + upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s AS j (mirror_job_name,lsn_offset,lsn_text,sync_batch_id,normalize_batch_id) VALUES ($1,$2,$3,$4,0) ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), - lsn_text = EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` + lsn_text=EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` checkIfJobMetadataExistsSQL = "SELECT COUNT(1)::TEXT::BOOL FROM %s.%s WHERE mirror_job_name=$1" updateMetadataForNormalizeRecordsSQL = "UPDATE %s.%s SET normalize_batch_id=$1 WHERE mirror_job_name=$2" diff --git a/nexus/catalog/migrations/V42__mysql_metadata.sql b/nexus/catalog/migrations/V42__mysql_metadata.sql index e5558667e1..cbc75934de 100644 --- a/nexus/catalog/migrations/V42__mysql_metadata.sql +++ b/nexus/catalog/migrations/V42__mysql_metadata.sql @@ -1,2 +1,3 @@ ALTER TABLE metadata_last_sync_state ADD COLUMN IF NOT EXISTS last_text text NOT NULL DEFAULT ''; +ALTER TABLE peerdb_stats.cdc_batches ADD COLUMN IF NOT EXISTS batch_end_lsn_text text NOT NULL DEFAULT ''; From 77448117651db7e394a5d81c690b5e547e8137c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 27 Dec 2024 21:20:20 +0000 Subject: [PATCH 17/80] fail fast --- flow/connectors/mysql/cdc.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 08657643db..25da54426b 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -289,10 +289,8 @@ func qvalueFromMysql(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValu mysql.MYSQL_TYPE_STRING: return qvalue.QValueString{Val: val} } - default: - panic(fmt.Sprintf("unexpected type %T for mysql type 
%d", val, mytype)) } - return nil + panic(fmt.Sprintf("unexpected type %T for mysql type %d", val, mytype)) } func (c *MySqlConnector) PullRecords( @@ -367,7 +365,7 @@ func (c *MySqlConnector) PullRecords( switch event.Header.EventType { case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: return errors.New("mysql v0 replication protocol not supported") - case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: + case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2, replication.MARIADB_WRITE_ROWS_COMPRESSED_EVENT_V1: for idx, val := range row { fd := schema.Columns[idx] items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) @@ -378,7 +376,7 @@ func (c *MySqlConnector) PullRecords( SourceTableName: sourceTableName, DestinationTableName: destinationTableName, } - case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: + case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2, replication.MARIADB_UPDATE_ROWS_COMPRESSED_EVENT_V1: oldItems := model.NewRecordItems(len(row) / 2) for idx, val := range row { fd := schema.Columns[idx>>1] @@ -396,7 +394,7 @@ func (c *MySqlConnector) PullRecords( SourceTableName: sourceTableName, DestinationTableName: destinationTableName, } - case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: + case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2, replication.MARIADB_DELETE_ROWS_COMPRESSED_EVENT_V1: for idx, val := range row { fd := schema.Columns[idx] items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) From 70fc2afb2fea99404600d2a4a61a76583f4ef064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sat, 28 Dec 2024 02:27:21 +0000 Subject: [PATCH 18/80] fix updates, fix syncer already open error --- flow/connectors/mysql/cdc.go | 76 +++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 25da54426b..e92af32d8c 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -202,6 +202,10 @@ func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication return c.syncer.StartSyncGTID(gset) } +func (c *MySqlConnector) closeSyncer() { + c.syncer.Close() +} + func (c *MySqlConnector) ReplPing(context.Context) error { return nil } @@ -308,6 +312,7 @@ func (c *MySqlConnector) PullRecords( if err != nil { return err } + defer c.closeSyncer() var fetchedBytesCounter metric.Int64Counter if otelManager != nil { @@ -359,59 +364,74 @@ func (c *MySqlConnector) PullRecords( sourceTableName := string(ev.Table.Schema) + "." 
+ string(ev.Table.Table) // TODO this is fragile destinationTableName := req.TableNameMapping[sourceTableName].Name schema := req.TableNameSchemaMapping[destinationTableName] - for _, row := range ev.Rows { - var record model.Record[model.RecordItems] - items := model.NewRecordItems(len(row)) - switch event.Header.EventType { - case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: - return errors.New("mysql v0 replication protocol not supported") - case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2, replication.MARIADB_WRITE_ROWS_COMPRESSED_EVENT_V1: + switch event.Header.EventType { + case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: + return errors.New("mysql v0 replication protocol not supported") + case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2, replication.MARIADB_WRITE_ROWS_COMPRESSED_EVENT_V1: + for _, row := range ev.Rows { + items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } - record = &model.InsertRecord[model.RecordItems]{ + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, &model.InsertRecord[model.RecordItems]{ BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, Items: items, SourceTableName: sourceTableName, DestinationTableName: destinationTableName, + }); err != nil { + return err } - case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2, replication.MARIADB_UPDATE_ROWS_COMPRESSED_EVENT_V1: - oldItems := model.NewRecordItems(len(row) / 2) - for idx, val := range row { - fd := schema.Columns[idx>>1] - qv := qvalueFromMysql(ev.Table.ColumnType[idx>>1], qvalue.QValueKind(fd.Type), val) - if (idx & 1) == 0 { // TODO test that it isn't other way around - oldItems.AddColumn(fd.Name, qv) - } else { - items.AddColumn(fd.Name, qv) - } + } + case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2, replication.MARIADB_UPDATE_ROWS_COMPRESSED_EVENT_V1: + // TODO populate UnchangedToastColumns with ev.SkippedColumns + for idx := 0; idx < len(ev.Rows); idx += 2 { + oldRow := ev.Rows[idx] + oldItems := model.NewRecordItems(len(oldRow)) + for idx, val := range oldRow { + fd := schema.Columns[idx] + oldItems.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } - record = &model.UpdateRecord[model.RecordItems]{ + newRow := ev.Rows[idx+1] + newItems := model.NewRecordItems(len(newRow)) + for idx, val := range ev.Rows[idx+1] { + fd := schema.Columns[idx] + newItems.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + } + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, &model.UpdateRecord[model.RecordItems]{ BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, OldItems: oldItems, - NewItems: items, + NewItems: newItems, SourceTableName: sourceTableName, DestinationTableName: destinationTableName, + }); err != nil { + return err } - case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2, replication.MARIADB_DELETE_ROWS_COMPRESSED_EVENT_V1: + } + case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2, replication.MARIADB_DELETE_ROWS_COMPRESSED_EVENT_V1: + for _, row := range ev.Rows { + items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] 
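+					// convert the raw binlog value to a QValue using the destination column's declared kind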
items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } - record = &model.DeleteRecord[model.RecordItems]{ + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, &model.DeleteRecord[model.RecordItems]{ BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, Items: items, SourceTableName: sourceTableName, DestinationTableName: destinationTableName, + }); err != nil { + return err } - default: - continue - } - recordCount += 1 - if err := req.RecordStream.AddRecord(ctx, record); err != nil { - return err } + default: + continue } } From ead7c8b5970721ef09ee212da44020013695db84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sat, 28 Dec 2024 13:38:17 +0000 Subject: [PATCH 19/80] fix lints --- flow/connectors/postgres/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index 89505e398f..cfda1d2a01 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -41,8 +41,8 @@ const ( getLastNormalizeBatchID_SQL = "SELECT normalize_batch_id FROM %s.%s WHERE mirror_job_name=$1" createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)" checkTableExistsSQL = "SELECT EXISTS (SELECT 1 FROM pg_catalog.pg_tables WHERE schemaname = $1 AND tablename = $2)" - upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s AS j (mirror_job_name,lsn_offset,lsn_text,sync_batch_id,normalize_batch_id) VALUES ($1,$2,$3,$4,0) - ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), + upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s AS j (mirror_job_name,lsn_offset,lsn_text,sync_batch_id,normalize_batch_id) + VALUES ($1,$2,$3,$4,0) ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), lsn_text=EXCLUDED.lsn_text, sync_batch_id=EXCLUDED.sync_batch_id` checkIfJobMetadataExistsSQL = "SELECT COUNT(1)::TEXT::BOOL FROM %s.%s WHERE mirror_job_name=$1" updateMetadataForNormalizeRecordsSQL = "UPDATE %s.%s SET normalize_batch_id=$1 WHERE mirror_job_name=$2" From 769c9adbb0c402120e9fcea8bf63128a9bf7cd37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 31 Dec 2024 18:25:41 +0000 Subject: [PATCH 20/80] qrep first draft --- flow/connectors/core.go | 1 + flow/connectors/mysql/cdc.go | 189 ++++++++--------------- flow/connectors/mysql/mysql.go | 116 +++++++++++++++ flow/connectors/mysql/qrep.go | 239 ++++++++++++++++++++++++++++++ flow/connectors/sqlserver/qrep.go | 66 ++++----- 5 files changed, 451 insertions(+), 160 deletions(-) create mode 100644 flow/connectors/mysql/qrep.go diff --git a/flow/connectors/core.go b/flow/connectors/core.go index c2954ac404..8c5539dec5 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -485,6 +485,7 @@ var ( _ CreateTablesFromExistingConnector = &connsnowflake.SnowflakeConnector{} _ QRepPullConnector = &connpostgres.PostgresConnector{} + _ QRepPullConnector = &connmysql.MySqlConnector{} _ QRepPullConnector = &connsqlserver.SQLServerConnector{} _ QRepPullPgConnector = &connpostgres.PostgresConnector{} diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index e92af32d8c..d8ce16753a 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -66,74 +66,13 @@ func (c *MySqlConnector) getTableSchemaForTable( } columns := make([]*protos.FieldDescription, 0, len(rs.Values)) primary := make([]string, 0) + for _, 
field := range rs.Fields { - var qkind qvalue.QValueKind - switch field.Type { - case mysql.MYSQL_TYPE_DECIMAL: - qkind = qvalue.QValueKindNumeric - case mysql.MYSQL_TYPE_TINY: - qkind = qvalue.QValueKindInt16 // TODO qvalue.QValueKindInt8 - case mysql.MYSQL_TYPE_SHORT: - qkind = qvalue.QValueKindInt16 - case mysql.MYSQL_TYPE_LONG: - qkind = qvalue.QValueKindInt32 - case mysql.MYSQL_TYPE_FLOAT: - qkind = qvalue.QValueKindFloat32 - case mysql.MYSQL_TYPE_DOUBLE: - qkind = qvalue.QValueKindFloat64 - case mysql.MYSQL_TYPE_NULL: - qkind = qvalue.QValueKindInvalid // TODO qvalue.QValueKindNothing - case mysql.MYSQL_TYPE_TIMESTAMP: - qkind = qvalue.QValueKindTimestamp - case mysql.MYSQL_TYPE_LONGLONG: - qkind = qvalue.QValueKindInt64 - case mysql.MYSQL_TYPE_INT24: - qkind = qvalue.QValueKindInt32 - case mysql.MYSQL_TYPE_DATE: - qkind = qvalue.QValueKindDate - case mysql.MYSQL_TYPE_TIME: - qkind = qvalue.QValueKindTime - case mysql.MYSQL_TYPE_DATETIME: - qkind = qvalue.QValueKindTimestamp - case mysql.MYSQL_TYPE_YEAR: - qkind = qvalue.QValueKindInt16 - case mysql.MYSQL_TYPE_NEWDATE: - qkind = qvalue.QValueKindDate - case mysql.MYSQL_TYPE_VARCHAR: - qkind = qvalue.QValueKindString - case mysql.MYSQL_TYPE_BIT: - qkind = qvalue.QValueKindInt64 - case mysql.MYSQL_TYPE_TIMESTAMP2: - qkind = qvalue.QValueKindTimestamp - case mysql.MYSQL_TYPE_DATETIME2: - qkind = qvalue.QValueKindTimestamp - case mysql.MYSQL_TYPE_TIME2: - qkind = qvalue.QValueKindTime - case mysql.MYSQL_TYPE_JSON: - qkind = qvalue.QValueKindJSON - case mysql.MYSQL_TYPE_NEWDECIMAL: - qkind = qvalue.QValueKindNumeric - case mysql.MYSQL_TYPE_ENUM: - qkind = qvalue.QValueKindInt64 - case mysql.MYSQL_TYPE_SET: - qkind = qvalue.QValueKindInt64 - case mysql.MYSQL_TYPE_TINY_BLOB: - qkind = qvalue.QValueKindBytes - case mysql.MYSQL_TYPE_MEDIUM_BLOB: - qkind = qvalue.QValueKindBytes - case mysql.MYSQL_TYPE_LONG_BLOB: - qkind = qvalue.QValueKindBytes - case mysql.MYSQL_TYPE_BLOB: - qkind = qvalue.QValueKindBytes - case mysql.MYSQL_TYPE_VAR_STRING: - qkind = qvalue.QValueKindString - case mysql.MYSQL_TYPE_STRING: - qkind = qvalue.QValueKindString - case mysql.MYSQL_TYPE_GEOMETRY: - qkind = qvalue.QValueKindGeometry - default: - return nil, fmt.Errorf("unknown mysql type %d", field.Type) + qkind, err := qkindFromMysql(field.Type) + if err != nil { + return nil, err } + column := &protos.FieldDescription{ Name: string(field.Name), Type: string(qkind), @@ -242,61 +181,6 @@ func (c *MySqlConnector) RemoveTablesFromPublication(ctx context.Context, req *p return nil } -func qvalueFromMysql(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { - // TODO signedness, in ev.Table, need to extend QValue system - // See go-mysql row_event.go for mapping - switch val := val.(type) { - case nil: - return qvalue.QValueNull(qkind) - case int8: // TODO qvalue.Int8 - return qvalue.QValueInt16{Val: int16(val)} - case int16: - return qvalue.QValueInt16{Val: val} - case int32: - return qvalue.QValueInt32{Val: val} - case int64: - return qvalue.QValueInt64{Val: val} - case float32: - return qvalue.QValueFloat32{Val: val} - case float64: - return qvalue.QValueFloat64{Val: val} - case decimal.Decimal: - return qvalue.QValueNumeric{Val: val} - case int: - // YEAR: https://dev.mysql.com/doc/refman/8.4/en/year.html - return qvalue.QValueInt16{Val: int16(val)} - case time.Time: - return qvalue.QValueTimestamp{Val: val} - case *replication.JsonDiff: - // TODO support somehow?? 
- return qvalue.QValueNull(qvalue.QValueKindJSON) - case []byte: - switch mytype { - case mysql.MYSQL_TYPE_BLOB: - return qvalue.QValueBytes{Val: val} - case mysql.MYSQL_TYPE_JSON: - return qvalue.QValueJSON{Val: string(val)} - case mysql.MYSQL_TYPE_GEOMETRY: - // TODO figure out mysql geo encoding - return qvalue.QValueGeometry{Val: string(val)} - } - case string: - switch mytype { - case mysql.MYSQL_TYPE_TIME: - // TODO parse - case mysql.MYSQL_TYPE_TIME2: - // TODO parse - case mysql.MYSQL_TYPE_DATE: - // TODO parse - case mysql.MYSQL_TYPE_VARCHAR, - mysql.MYSQL_TYPE_VAR_STRING, - mysql.MYSQL_TYPE_STRING: - return qvalue.QValueString{Val: val} - } - } - panic(fmt.Sprintf("unexpected type %T for mysql type %d", val, mytype)) -} - func (c *MySqlConnector) PullRecords( ctx context.Context, catalogPool *pgxpool.Pool, @@ -372,7 +256,7 @@ func (c *MySqlConnector) PullRecords( items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] - items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + items.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -392,13 +276,13 @@ func (c *MySqlConnector) PullRecords( oldItems := model.NewRecordItems(len(oldRow)) for idx, val := range oldRow { fd := schema.Columns[idx] - oldItems.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + oldItems.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } newRow := ev.Rows[idx+1] newItems := model.NewRecordItems(len(newRow)) for idx, val := range ev.Rows[idx+1] { fd := schema.Columns[idx] - newItems.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + newItems.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -417,7 +301,7 @@ func (c *MySqlConnector) PullRecords( items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] - items.AddColumn(fd.Name, qvalueFromMysql(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + items.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -440,3 +324,58 @@ func (c *MySqlConnector) PullRecords( } } } + +func qvalueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { + // TODO signedness, in ev.Table, need to extend QValue system + // See go-mysql row_event.go for mapping + switch val := val.(type) { + case nil: + return qvalue.QValueNull(qkind) + case int8: // TODO qvalue.Int8 + return qvalue.QValueInt16{Val: int16(val)} + case int16: + return qvalue.QValueInt16{Val: val} + case int32: + return qvalue.QValueInt32{Val: val} + case int64: + return qvalue.QValueInt64{Val: val} + case float32: + return qvalue.QValueFloat32{Val: val} + case float64: + return qvalue.QValueFloat64{Val: val} + case decimal.Decimal: + return qvalue.QValueNumeric{Val: val} + case int: + // YEAR: https://dev.mysql.com/doc/refman/8.4/en/year.html + return qvalue.QValueInt16{Val: int16(val)} + case time.Time: + return qvalue.QValueTimestamp{Val: val} + case *replication.JsonDiff: + // TODO support somehow?? 
+ return qvalue.QValueNull(qvalue.QValueKindJSON) + case []byte: + switch mytype { + case mysql.MYSQL_TYPE_BLOB: + return qvalue.QValueBytes{Val: val} + case mysql.MYSQL_TYPE_JSON: + return qvalue.QValueJSON{Val: string(val)} + case mysql.MYSQL_TYPE_GEOMETRY: + // TODO figure out mysql geo encoding + return qvalue.QValueGeometry{Val: string(val)} + } + case string: + switch mytype { + case mysql.MYSQL_TYPE_TIME: + // TODO parse + case mysql.MYSQL_TYPE_TIME2: + // TODO parse + case mysql.MYSQL_TYPE_DATE: + // TODO parse + case mysql.MYSQL_TYPE_VARCHAR, + mysql.MYSQL_TYPE_VAR_STRING, + mysql.MYSQL_TYPE_STRING: + return qvalue.QValueString{Val: val} + } + } + panic(fmt.Sprintf("unexpected type %T for mysql type %d", val, mytype)) +} diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index ca92989e0c..622f8c70de 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -5,6 +5,7 @@ package connmysql import ( "context" "crypto/tls" + "errors" "fmt" "log/slog" "time" @@ -16,6 +17,7 @@ import ( metadataStore "github.com/PeerDB-io/peerdb/flow/connectors/external_metadata" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model/qvalue" "github.com/PeerDB-io/peerdb/flow/shared" ) @@ -76,6 +78,7 @@ func (c *MySqlConnector) connect(ctx context.Context, options ...client.Option) func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interface{}) (*mysql.Result, error) { reconnects := 3 for { + // TODO need new connection if ctx changes between calls, or make upstream PR if c.conn == nil { var err error var argF []client.Option @@ -154,3 +157,116 @@ func (c *MySqlConnector) GetVersion(ctx context.Context) (string, error) { c.logger.Info("[mysql] version", slog.String("version", version)) return version, nil } + +func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { + switch ty { + case mysql.MYSQL_TYPE_DECIMAL: + return qvalue.QValueKindNumeric, nil + case mysql.MYSQL_TYPE_TINY: + return qvalue.QValueKindInt16, nil // TODO qvalue.QValueKindInt8 + case mysql.MYSQL_TYPE_SHORT: + return qvalue.QValueKindInt16, nil + case mysql.MYSQL_TYPE_LONG: + return qvalue.QValueKindInt32, nil + case mysql.MYSQL_TYPE_FLOAT: + return qvalue.QValueKindFloat32, nil + case mysql.MYSQL_TYPE_DOUBLE: + return qvalue.QValueKindFloat64, nil + case mysql.MYSQL_TYPE_NULL: + return qvalue.QValueKindInvalid, nil // TODO qvalue.QValueKindNothing + case mysql.MYSQL_TYPE_TIMESTAMP: + return qvalue.QValueKindTimestamp, nil + case mysql.MYSQL_TYPE_LONGLONG: + return qvalue.QValueKindInt64, nil + case mysql.MYSQL_TYPE_INT24: + return qvalue.QValueKindInt32, nil + case mysql.MYSQL_TYPE_DATE: + return qvalue.QValueKindDate, nil + case mysql.MYSQL_TYPE_TIME: + return qvalue.QValueKindTime, nil + case mysql.MYSQL_TYPE_DATETIME: + return qvalue.QValueKindTimestamp, nil + case mysql.MYSQL_TYPE_YEAR: + return qvalue.QValueKindInt16, nil + case mysql.MYSQL_TYPE_NEWDATE: + return qvalue.QValueKindDate, nil + case mysql.MYSQL_TYPE_VARCHAR: + return qvalue.QValueKindString, nil + case mysql.MYSQL_TYPE_BIT: + return qvalue.QValueKindInt64, nil + case mysql.MYSQL_TYPE_TIMESTAMP2: + return qvalue.QValueKindTimestamp, nil + case mysql.MYSQL_TYPE_DATETIME2: + return qvalue.QValueKindTimestamp, nil + case mysql.MYSQL_TYPE_TIME2: + return qvalue.QValueKindTime, nil + case mysql.MYSQL_TYPE_JSON: + return qvalue.QValueKindJSON, nil + case mysql.MYSQL_TYPE_NEWDECIMAL: + return qvalue.QValueKindNumeric, nil + case mysql.MYSQL_TYPE_ENUM: + return 
qvalue.QValueKindInt64, nil + case mysql.MYSQL_TYPE_SET: + return qvalue.QValueKindInt64, nil + case mysql.MYSQL_TYPE_TINY_BLOB: + return qvalue.QValueKindBytes, nil + case mysql.MYSQL_TYPE_MEDIUM_BLOB: + return qvalue.QValueKindBytes, nil + case mysql.MYSQL_TYPE_LONG_BLOB: + return qvalue.QValueKindBytes, nil + case mysql.MYSQL_TYPE_BLOB: + return qvalue.QValueKindBytes, nil + case mysql.MYSQL_TYPE_VAR_STRING: + return qvalue.QValueKindString, nil + case mysql.MYSQL_TYPE_STRING: + return qvalue.QValueKindString, nil + case mysql.MYSQL_TYPE_GEOMETRY: + return qvalue.QValueKindGeometry, nil + default: + return qvalue.QValueKind(""), fmt.Errorf("unknown mysql type %d", ty) + } +} + +func qvalueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qvalue.QValue, error) { + // TODO fill this in, maybe contribute upstream, figvure out how numeric etc fit in + switch v := fv.Value().(type) { + case nil: + return qvalue.QValueNull(qkind), nil + case uint64: + // TODO unsigned integers + return nil, errors.New("mysql unsigned integers not supported") + case int64: + switch qkind { + case qvalue.QValueKindInt16: + return qvalue.QValueInt16{Val: int16(v)}, nil + case qvalue.QValueKindInt32: + return qvalue.QValueInt32{Val: int32(v)}, nil + case qvalue.QValueKindInt64: + return qvalue.QValueInt64{Val: v}, nil + default: + return nil, fmt.Errorf("cannot convert int to %s", qkind) + } + case float64: + switch qkind { + case qvalue.QValueKindFloat32: + return qvalue.QValueFloat32{Val: float32(v)}, nil + case qvalue.QValueKindFloat64: + return qvalue.QValueFloat64{Val: float64(v)}, nil + default: + return nil, fmt.Errorf("cannot convert float to %s", qkind) + } + case string: + switch qkind { + case qvalue.QValueKindString: + return qvalue.QValueString{Val: v}, nil + case qvalue.QValueKindBytes: + return qvalue.QValueBytes{Val: []byte(v)}, nil + case qvalue.QValueKindJSON: + return qvalue.QValueJSON{Val: v}, nil + default: + return nil, fmt.Errorf("cannot convert string to %s", qkind) + } + default: + return nil, fmt.Errorf("unexpected mysql type %T", v) + } +} diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go new file mode 100644 index 0000000000..9ac4b57158 --- /dev/null +++ b/flow/connectors/mysql/qrep.go @@ -0,0 +1,239 @@ +package connmysql + +import ( + "bytes" + "context" + "errors" + "fmt" + "log/slog" + "text/template" + + "github.com/go-mysql-org/go-mysql/mysql" + "github.com/google/uuid" + "go.temporal.io/sdk/log" + + utils "github.com/PeerDB-io/peer-flow/connectors/utils/partition" + "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/model" + "github.com/PeerDB-io/peer-flow/model/qvalue" +) + +func (c *MySqlConnector) GetQRepPartitions( + ctx context.Context, + config *protos.QRepConfig, + last *protos.QRepPartition, +) ([]*protos.QRepPartition, error) { + if config.WatermarkTable == "" { + c.logger.Info("watermark table is empty, doing full table refresh") + return []*protos.QRepPartition{ + { + PartitionId: uuid.New().String(), + FullTablePartition: true, + }, + }, nil + } + + if config.NumRowsPerPartition <= 0 { + return nil, errors.New("num rows per partition must be greater than 0 for sql server") + } + + var err error + numRowsPerPartition := int64(config.NumRowsPerPartition) + quotedWatermarkColumn := fmt.Sprintf("\"%s\"", config.WatermarkColumn) + + whereClause := "" + if last != nil && last.Range != nil { + whereClause = fmt.Sprintf("WHERE %s > $1", quotedWatermarkColumn) + } + + // Query to get the total number of 
rows in the table + countQuery := fmt.Sprintf("SELECT COUNT(*) FROM %s %s", config.WatermarkTable, whereClause) + var minVal interface{} + var totalRows int64 + if last != nil && last.Range != nil { + switch lastRange := last.Range.Range.(type) { + case *protos.PartitionRange_IntRange: + minVal = lastRange.IntRange.End + case *protos.PartitionRange_TimestampRange: + minVal = lastRange.TimestampRange.End.AsTime() + } + c.logger.Info(fmt.Sprintf("count query: %s - minVal: %v", countQuery, minVal)) + + rs, err := c.Execute(ctx, countQuery, minVal) + if err != nil { + return nil, err + } + + totalRows, err = rs.GetInt(0, 0) + if err != nil { + return nil, fmt.Errorf("failed to query for total rows: %w", err) + } + } else { + rs, err := c.Execute(ctx, countQuery) + if err != nil { + return nil, err + } + + totalRows, err = rs.GetInt(0, 0) + if err != nil { + return nil, fmt.Errorf("failed to query for total rows: %w", err) + } + } + + if totalRows == 0 { + c.logger.Warn("no records to replicate, returning") + return make([]*protos.QRepPartition, 0), nil + } + + // Calculate the number of partitions + numPartitions := totalRows / numRowsPerPartition + if totalRows%numRowsPerPartition != 0 { + numPartitions++ + } + c.logger.Info(fmt.Sprintf("total rows: %d, num partitions: %d, num rows per partition: %d", + totalRows, numPartitions, numRowsPerPartition)) + var rs *mysql.Result + if minVal != nil { + // Query to get partitions using window functions + partitionsQuery := fmt.Sprintf( + `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v + FROM ( + SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from + FROM %s WHERE %s > $1 + ) AS subquery + GROUP BY bucket_v + ORDER BY start_v`, + numPartitions, + quotedWatermarkColumn, + quotedWatermarkColumn, + config.WatermarkTable, + quotedWatermarkColumn, + ) + c.logger.Info(fmt.Sprintf("partitions query: %s - minVal: %v", partitionsQuery, minVal)) + rs, err = c.Execute(ctx, partitionsQuery, minVal) + } else { + partitionsQuery := fmt.Sprintf( + `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v + FROM ( + SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from + FROM %s + ) AS subquery + GROUP BY bucket_v + ORDER BY start_v`, + numPartitions, + quotedWatermarkColumn, + quotedWatermarkColumn, + config.WatermarkTable, + ) + c.logger.Info("partitions query: " + partitionsQuery) + rs, err = c.Execute(ctx, partitionsQuery) + } + if err != nil { + return nil, fmt.Errorf("failed to query for partitions: %w", err) + } + + partitionHelper := utils.NewPartitionHelper() + for _, row := range rs.Values { + if err := partitionHelper.AddPartition(row[1].Value(), row[2].Value()); err != nil { + return nil, fmt.Errorf("failed to add partition: %w", err) + } + } + + return partitionHelper.GetPartitions(), nil +} + +// TODO use ExecuteStreamingSelect +func (c *MySqlConnector) PullQRepRecords( + ctx context.Context, + config *protos.QRepConfig, + last *protos.QRepPartition, + stream *model.QRecordStream, +) (int, error) { + // Build the query to pull records within the range from the source table + // Be sure to order the results by the watermark column to ensure consistency across pulls + query, err := BuildQuery(c.logger, config.Query) + if err != nil { + return 0, err + } + + var rs *mysql.Result + if last.FullTablePartition { + var err error + // this is a full table partition, so just run the query + rs, err = c.Execute(ctx, query) + if err != nil { + return 0, err + } + } else { + var rangeStart interface{} + var rangeEnd interface{} 
+ + // Depending on the type of the range, convert the range into the correct type + switch x := last.Range.Range.(type) { + case *protos.PartitionRange_IntRange: + rangeStart = x.IntRange.Start + rangeEnd = x.IntRange.End + case *protos.PartitionRange_TimestampRange: + rangeStart = x.TimestampRange.Start.AsTime() + rangeEnd = x.TimestampRange.End.AsTime() + default: + return 0, fmt.Errorf("unknown range type: %v", x) + } + + var err error + rs, err = c.Execute(ctx, query, rangeStart, rangeEnd) + if err != nil { + return 0, err + } + } + + schema := make([]qvalue.QField, 0, len(rs.Fields)) + for _, field := range rs.Fields { + qkind, err := qkindFromMysql(field.Type) + if err != nil { + return 0, err + } + + schema = append(schema, qvalue.QField{ + Name: string(field.Name), + Type: qkind, + Precision: 0, // TODO numerics + Scale: 0, // TODO numerics + Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, + }) + } + stream.SetSchema(qvalue.QRecordSchema{Fields: schema}) + for _, row := range rs.Values { + record := make([]qvalue.QValue, 0, len(row)) + for idx, val := range row { + qv, err := qvalueFromMysqlFieldValue(schema[idx].Type, val) + if err != nil { + return 0, err + } + record = append(record, qv) + } + stream.Records <- record + } + return len(rs.Values), nil +} + +func BuildQuery(logger log.Logger, query string) (string, error) { + tmpl, err := template.New("query").Parse(query) + if err != nil { + return "", err + } + + data := map[string]interface{}{ + "start": "$1", + "end": "$2", + } + + buf := new(bytes.Buffer) + if err := tmpl.Execute(buf, data); err != nil { + return "", err + } + res := buf.String() + + logger.Info("[mysql] templated query", slog.String("query", res)) + return res, nil +} diff --git a/flow/connectors/sqlserver/qrep.go b/flow/connectors/sqlserver/qrep.go index 6212a85ab0..c3323a68b6 100644 --- a/flow/connectors/sqlserver/qrep.go +++ b/flow/connectors/sqlserver/qrep.go @@ -46,7 +46,7 @@ func (c *SQLServerConnector) GetQRepPartitions( // Query to get the total number of rows in the table countQuery := fmt.Sprintf("SELECT COUNT(*) FROM %s %s", config.WatermarkTable, whereClause) - var minVal interface{} = nil + var minVal interface{} var totalRows pgtype.Int8 if last != nil && last.Range != nil { switch lastRange := last.Range.Range.(type) { @@ -78,11 +78,8 @@ func (c *SQLServerConnector) GetQRepPartitions( if err != nil { return nil, fmt.Errorf("failed to query for total rows: %w", err) } - } else { - row := c.db.QueryRowContext(ctx, countQuery) - if err = row.Scan(&totalRows); err != nil { - return nil, fmt.Errorf("failed to query for total rows: %w", err) - } + } else if err := c.db.QueryRowContext(ctx, countQuery).Scan(&totalRows); err != nil { + return nil, fmt.Errorf("failed to query for total rows: %w", err) } if totalRows.Int64 == 0 { @@ -150,8 +147,7 @@ func (c *SQLServerConnector) GetQRepPartitions( return nil, fmt.Errorf("failed to scan row: %w", err) } - err = partitionHelper.AddPartition(start, end) - if err != nil { + if err := partitionHelper.AddPartition(start, end); err != nil { return nil, fmt.Errorf("failed to add partition: %w", err) } } @@ -162,7 +158,7 @@ func (c *SQLServerConnector) GetQRepPartitions( func (c *SQLServerConnector) PullQRepRecords( ctx context.Context, config *protos.QRepConfig, - partition *protos.QRepPartition, + last *protos.QRepPartition, stream *model.QRecordStream, ) (int, error) { // Build the query to pull records within the range from the source table @@ -172,40 +168,40 @@ func (c *SQLServerConnector) 
PullQRepRecords( return 0, err } - if partition.FullTablePartition { + var qbatch *model.QRecordBatch + if last.FullTablePartition { // this is a full table partition, so just run the query - qbatch, err := c.ExecuteAndProcessQuery(ctx, query) + var err error + qbatch, err = c.ExecuteAndProcessQuery(ctx, query) if err != nil { return 0, err } - qbatch.FeedToQRecordStream(stream) - return len(qbatch.Records), nil - } + } else { + var rangeStart interface{} + var rangeEnd interface{} - var rangeStart interface{} - var rangeEnd interface{} - - // Depending on the type of the range, convert the range into the correct type - switch x := partition.Range.Range.(type) { - case *protos.PartitionRange_IntRange: - rangeStart = x.IntRange.Start - rangeEnd = x.IntRange.End - case *protos.PartitionRange_TimestampRange: - rangeStart = x.TimestampRange.Start.AsTime() - rangeEnd = x.TimestampRange.End.AsTime() - default: - return 0, fmt.Errorf("unknown range type: %v", x) - } + // Depending on the type of the range, convert the range into the correct type + switch x := last.Range.Range.(type) { + case *protos.PartitionRange_IntRange: + rangeStart = x.IntRange.Start + rangeEnd = x.IntRange.End + case *protos.PartitionRange_TimestampRange: + rangeStart = x.TimestampRange.Start.AsTime() + rangeEnd = x.TimestampRange.End.AsTime() + default: + return 0, fmt.Errorf("unknown range type: %v", x) + } - rangeParams := map[string]interface{}{ - "startRange": rangeStart, - "endRange": rangeEnd, + var err error + qbatch, err = c.NamedExecuteAndProcessQuery(ctx, query, map[string]interface{}{ + "startRange": rangeStart, + "endRange": rangeEnd, + }) + if err != nil { + return 0, err + } } - qbatch, err := c.NamedExecuteAndProcessQuery(ctx, query, rangeParams) - if err != nil { - return 0, err - } qbatch.FeedToQRecordStream(stream) return len(qbatch.Records), nil } From f467e1417b9bf39d15dc7bad94c081e65e1bfd25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 2 Jan 2025 15:54:36 +0000 Subject: [PATCH 21/80] cleanup --- flow/workflows/snapshot_flow.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/workflows/snapshot_flow.go b/flow/workflows/snapshot_flow.go index 908a78ba79..bc43f5ad30 100644 --- a/flow/workflows/snapshot_flow.go +++ b/flow/workflows/snapshot_flow.go @@ -62,8 +62,7 @@ func (s *SnapshotFlowExecution) setupReplication( } res := &protos.SetupReplicationOutput{} - setupReplicationFuture := workflow.ExecuteActivity(ctx, snapshot.SetupReplication, setupReplicationInput) - if err := setupReplicationFuture.Get(ctx, &res); err != nil { + if err := workflow.ExecuteActivity(ctx, snapshot.SetupReplication, setupReplicationInput).Get(ctx, &res); err != nil { return nil, fmt.Errorf("failed to setup replication on source peer: %w", err) } @@ -257,8 +256,7 @@ func (s *SnapshotFlowExecution) cloneTables( if v.PartitionKey == "" { v.PartitionKey = defaultPartitionCol } - err := s.cloneTable(ctx, boundSelector, snapshotName, v) - if err != nil { + if err := s.cloneTable(ctx, boundSelector, snapshotName, v); err != nil { s.logger.Error("failed to start clone child workflow", slog.Any("error", err)) continue } From d2cea38690c910be09bc5176b0ce0b863e2395cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 3 Jan 2025 16:31:18 +0000 Subject: [PATCH 22/80] fix partition query with pg improvements, fix full table refresh check for sqlserver & mysql --- flow/connectors/mysql/qrep.go | 40 +++++++++----------- flow/connectors/postgres/qrep.go | 29 
+++++++------- flow/connectors/sqlserver/qrep.go | 32 +++++++--------- flow/connectors/utils/partition/partition.go | 2 +- flow/workflows/qrep_flow.go | 16 +++----- 5 files changed, 53 insertions(+), 66 deletions(-) diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 9ac4b57158..f4651e938d 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -23,8 +23,8 @@ func (c *MySqlConnector) GetQRepPartitions( config *protos.QRepConfig, last *protos.QRepPartition, ) ([]*protos.QRepPartition, error) { - if config.WatermarkTable == "" { - c.logger.Info("watermark table is empty, doing full table refresh") + if config.WatermarkColumn == "" { + // if no watermark column is specified, return a single partition return []*protos.QRepPartition{ { PartitionId: uuid.New().String(), @@ -39,7 +39,7 @@ func (c *MySqlConnector) GetQRepPartitions( var err error numRowsPerPartition := int64(config.NumRowsPerPartition) - quotedWatermarkColumn := fmt.Sprintf("\"%s\"", config.WatermarkColumn) + quotedWatermarkColumn := fmt.Sprintf("`%s`", config.WatermarkColumn) whereClause := "" if last != nil && last.Range != nil { @@ -96,36 +96,32 @@ func (c *MySqlConnector) GetQRepPartitions( if minVal != nil { // Query to get partitions using window functions partitionsQuery := fmt.Sprintf( - `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v - FROM ( - SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from - FROM %s WHERE %s > $1 - ) AS subquery - GROUP BY bucket_v - ORDER BY start_v`, + `SELECT bucket, MIN(%[2]s) AS start, MAX(%[2]s) AS end + FROM ( + SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s + FROM %[3]s WHERE %[2]s > $1 + ) AS subquery + GROUP BY bucket + ORDER BY start`, numPartitions, quotedWatermarkColumn, - quotedWatermarkColumn, config.WatermarkTable, - quotedWatermarkColumn, ) - c.logger.Info(fmt.Sprintf("partitions query: %s - minVal: %v", partitionsQuery, minVal)) + c.logger.Info("partitions query", slog.String("query", partitionsQuery), slog.Any("minVal", minVal)) rs, err = c.Execute(ctx, partitionsQuery, minVal) } else { partitionsQuery := fmt.Sprintf( - `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v - FROM ( - SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from - FROM %s - ) AS subquery - GROUP BY bucket_v - ORDER BY start_v`, + `SELECT bucket_v, MIN(%[2]s) AS start, MAX(%[2]s) AS end + FROM ( + SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s FROM %[3]s + ) AS subquery + GROUP BY bucket + ORDER BY start`, numPartitions, quotedWatermarkColumn, - quotedWatermarkColumn, config.WatermarkTable, ) - c.logger.Info("partitions query: " + partitionsQuery) + c.logger.Info("partitions query", slog.String("query", partitionsQuery)) rs, err = c.Execute(ctx, partitionsQuery) } if err != nil { diff --git a/flow/connectors/postgres/qrep.go b/flow/connectors/postgres/qrep.go index 22fb456ea4..071234b2cb 100644 --- a/flow/connectors/postgres/qrep.go +++ b/flow/connectors/postgres/qrep.go @@ -43,12 +43,13 @@ func (c *PostgresConnector) GetQRepPartitions( ) ([]*protos.QRepPartition, error) { if config.WatermarkColumn == "" { // if no watermark column is specified, return a single partition - partition := &protos.QRepPartition{ - PartitionId: uuid.New().String(), - FullTablePartition: true, - Range: nil, - } - return []*protos.QRepPartition{partition}, nil + return []*protos.QRepPartition{ + { + PartitionId: uuid.New().String(), + FullTablePartition: true, + Range: nil, + }, + }, nil } // begin a transaction 
@@ -135,32 +136,30 @@ func (c *PostgresConnector) getNumRowsPartitions( partitionsQuery := fmt.Sprintf( `SELECT bucket, MIN(%[2]s) AS start, MAX(%[2]s) AS end FROM ( - SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s - FROM %[3]s WHERE %[2]s > $1 + SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s + FROM %[3]s WHERE %[2]s > $1 ) subquery GROUP BY bucket - ORDER BY start - `, + ORDER BY start`, numPartitions, quotedWatermarkColumn, parsedWatermarkTable.String(), ) - c.logger.Info("[row_based_next] partitions query: " + partitionsQuery) + c.logger.Info("[row_based_next] partitions query", slog.String("query", partitionsQuery)) rows, err = tx.Query(ctx, partitionsQuery, minVal) } else { partitionsQuery := fmt.Sprintf( `SELECT bucket, MIN(%[2]s) AS start, MAX(%[2]s) AS end FROM ( - SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s FROM %[3]s + SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s FROM %[3]s ) subquery GROUP BY bucket - ORDER BY start - `, + ORDER BY start`, numPartitions, quotedWatermarkColumn, parsedWatermarkTable.String(), ) - c.logger.Info("[row_based] partitions query: " + partitionsQuery) + c.logger.Info("[row_based] partitions query", slog.String("query", partitionsQuery)) rows, err = tx.Query(ctx, partitionsQuery) } if err != nil { diff --git a/flow/connectors/sqlserver/qrep.go b/flow/connectors/sqlserver/qrep.go index c3323a68b6..9ed3d5c6f5 100644 --- a/flow/connectors/sqlserver/qrep.go +++ b/flow/connectors/sqlserver/qrep.go @@ -22,7 +22,7 @@ func (c *SQLServerConnector) GetQRepPartitions( ctx context.Context, config *protos.QRepConfig, last *protos.QRepPartition, ) ([]*protos.QRepPartition, error) { if config.WatermarkTable == "" { - c.logger.Info("watermark table is empty, doing full table refresh") + // if no watermark column is specified, return a single partition return []*protos.QRepPartition{ { PartitionId: uuid.New().String(), @@ -98,18 +98,16 @@ func (c *SQLServerConnector) GetQRepPartitions( if minVal != nil { // Query to get partitions using window functions partitionsQuery := fmt.Sprintf( - `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v - FROM ( - SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from - FROM %s WHERE %s > :minVal - ) AS subquery - GROUP BY bucket_v - ORDER BY start_v`, + `SELECT bucket, MIN(%[2]s) AS start_v, MAX(%[2]s) AS end_v + FROM ( + SELECT NTILE(%[1]d) OVER (ORDER BY %s) AS bucket, %[2]s + FROM %[3]s WHERE %[2]s > :minVal + ) AS subquery + GROUP BY bucket + ORDER BY start_v`, numPartitions, quotedWatermarkColumn, - quotedWatermarkColumn, config.WatermarkTable, - quotedWatermarkColumn, ) c.logger.Info(fmt.Sprintf("partitions query: %s - minVal: %v", partitionsQuery, minVal)) params := map[string]interface{}{ @@ -118,16 +116,14 @@ func (c *SQLServerConnector) GetQRepPartitions( rows, err = c.db.NamedQuery(partitionsQuery, params) } else { partitionsQuery := fmt.Sprintf( - `SELECT bucket_v, MIN(v_from) AS start_v, MAX(v_from) AS end_v - FROM ( - SELECT NTILE(%d) OVER (ORDER BY %s) AS bucket_v, %s as v_from - FROM %s - ) AS subquery - GROUP BY bucket_v - ORDER BY start_v`, + `SELECT bucket, MIN(%[2]s) AS start_v, MAX(%[2]s) AS end_v + FROM ( + SELECT NTILE(%[1]d) OVER (ORDER BY %[2]s) AS bucket, %[2]s FROM %[3]s + ) AS subquery + GROUP BY bucket + ORDER BY start_v`, numPartitions, quotedWatermarkColumn, - quotedWatermarkColumn, config.WatermarkTable, ) c.logger.Info("partitions query: " + partitionsQuery) diff --git a/flow/connectors/utils/partition/partition.go 
b/flow/connectors/utils/partition/partition.go index a8e5bfac16..4a83df0aa3 100644 --- a/flow/connectors/utils/partition/partition.go +++ b/flow/connectors/utils/partition/partition.go @@ -132,7 +132,7 @@ func NewPartitionHelper() *PartitionHelper { } func (p *PartitionHelper) AddPartition(start interface{}, end interface{}) error { - slog.Debug(fmt.Sprintf("adding partition - start: %v, end: %v", start, end)) + slog.Info(fmt.Sprintf("adding partition - start: %v, end: %v", start, end)) // Skip partition if it's fully contained within the previous one // If it's not fully contained but overlaps, adjust the start diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index 87555371a7..14d82cfdf4 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -187,9 +187,8 @@ func (q *QRepFlowExecution) getPartitions( }, }) - partitionsFuture := workflow.ExecuteActivity(ctx, flowable.GetQRepPartitions, q.config, last, q.runUUID) var partitions *protos.QRepParitionResult - if err := partitionsFuture.Get(ctx, &partitions); err != nil { + if err := workflow.ExecuteActivity(ctx, flowable.GetQRepPartitions, q.config, last, q.runUUID).Get(ctx, &partitions); err != nil { return nil, fmt.Errorf("failed to fetch partitions to replicate: %w", err) } @@ -213,8 +212,7 @@ func (q *QRepPartitionFlowExecution) replicatePartitions(ctx workflow.Context, }, }) - msg := fmt.Sprintf("replicating partition batch - %d", partitions.BatchId) - q.logger.Info(msg) + q.logger.Info("replicating partition batch", slog.Int64("BatchID", int64(partitions.BatchId))) if err := workflow.ExecuteActivity(ctx, flowable.ReplicateQRepPartitions, q.config, partitions, q.runUUID).Get(ctx, nil); err != nil { return fmt.Errorf("failed to replicate partition: %w", err) @@ -424,18 +422,16 @@ func (q *QRepFlowExecution) handleTableRenameForResync(ctx workflow.Context, sta func setWorkflowQueries(ctx workflow.Context, state *protos.QRepFlowState) error { // Support a Query for the current state of the qrep flow. - err := workflow.SetQueryHandler(ctx, shared.QRepFlowStateQuery, func() (*protos.QRepFlowState, error) { + if err := workflow.SetQueryHandler(ctx, shared.QRepFlowStateQuery, func() (*protos.QRepFlowState, error) { return state, nil - }) - if err != nil { + }); err != nil { return fmt.Errorf("failed to set `%s` query handler: %w", shared.QRepFlowStateQuery, err) } // Support a Query for the current status of the qrep flow. - err = workflow.SetQueryHandler(ctx, shared.FlowStatusQuery, func() (protos.FlowStatus, error) { + if err := workflow.SetQueryHandler(ctx, shared.FlowStatusQuery, func() (protos.FlowStatus, error) { return state.CurrentFlowStatus, nil - }) - if err != nil { + }); err != nil { return fmt.Errorf("failed to set `%s` query handler: %w", shared.FlowStatusQuery, err) } From 09a4f65eed5ccb96ae020af5d6cb8541401348a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 3 Jan 2025 17:05:15 +0000 Subject: [PATCH 23/80] set sql_mode = ANSI. 
It was this or adding template parameter for table identifier --- flow/connectors/mysql/mysql.go | 1 + flow/middleware/oauth.go | 2 +- flow/workflows/qrep_flow.go | 8 +++----- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 622f8c70de..25d90be4d3 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -92,6 +92,7 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf if err != nil { return nil, fmt.Errorf("failed to connect to mysql server: %w", err) } + c.conn.Execute("SET sql_mode = ANSI") } rs, err := c.conn.Execute(cmd, args...) diff --git a/flow/middleware/oauth.go b/flow/middleware/oauth.go index 60e965fe6f..bb488be4f8 100644 --- a/flow/middleware/oauth.go +++ b/flow/middleware/oauth.go @@ -83,7 +83,7 @@ func AuthGrpcMiddleware(unauthenticatedMethods []string) (grpc.UnaryServerInterc _, err := validateRequestToken(authHeader, cfg.OauthJwtCustomClaims, ip...) if err != nil { slog.Debug("Failed to validate request token", slog.String("method", info.FullMethod), slog.Any("error", err)) - return nil, status.Errorf(codes.Unauthenticated, "%s", err.Error()) + return nil, status.Error(codes.Unauthenticated, err.Error()) } } diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index 14d82cfdf4..72e83a3f20 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -223,8 +223,7 @@ func (q *QRepPartitionFlowExecution) replicatePartitions(ctx workflow.Context, // getPartitionWorkflowID returns the child workflow ID for a new sync flow. func (q *QRepFlowExecution) getPartitionWorkflowID(ctx workflow.Context) string { - id := GetUUID(ctx) - return fmt.Sprintf("qrep-part-%s-%s", q.config.FlowJobName, id) + return fmt.Sprintf("qrep-part-%s-%s", q.config.FlowJobName, GetUUID(ctx)) } // startChildWorkflow starts a single child workflow. 
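For context on the SET sql_mode = ANSI change in this patch: ANSI mode enables ANSI_QUOTES, so MySQL parses double-quoted names as identifiers the way Postgres does, which is why one shared quoting template can be used instead of a per-flavor table-identifier parameter. Below is a small illustrative sketch of such a helper; the quoteIdentifier name is an assumption for the example and not part of the patch.

package main

import (
	"fmt"
	"strings"
)

// quoteIdentifier returns an ANSI/Postgres style quoted identifier, doubling
// any embedded double quotes. With the MySQL session running under
// SET sql_mode = ANSI (which includes ANSI_QUOTES), MySQL accepts the same
// form, so the identifier does not need a separate template parameter per flavor.
func quoteIdentifier(name string) string {
	return `"` + strings.ReplaceAll(name, `"`, `""`) + `"`
}

func main() {
	fmt.Println(quoteIdentifier("events"))   // "events"
	fmt.Println(quoteIdentifier(`odd"name`)) // "odd""name"
}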
@@ -266,11 +265,10 @@ func (q *QRepFlowExecution) processPartitions( partitionWorkflows := make([]workflow.Future, 0, len(batches)) for i, parts := range batches { - batch := &protos.QRepPartitionBatch{ + future := q.startChildWorkflow(ctx, &protos.QRepPartitionBatch{ Partitions: parts, BatchId: int32(i + 1), - } - future := q.startChildWorkflow(ctx, batch) + }) partitionWorkflows = append(partitionWorkflows, future) } From b338d6891a7b97ba5a1ec8e901932877ab93f032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 3 Jan 2025 21:13:18 +0000 Subject: [PATCH 24/80] fixes --- flow/connectors/mysql/mysql.go | 8 ++++---- flow/connectors/mysql/qrep.go | 1 + flow/workflows/snapshot_flow.go | 27 +++++++++++++-------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 25d90be4d3..56bc9369c1 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -256,14 +256,14 @@ func qvalueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv default: return nil, fmt.Errorf("cannot convert float to %s", qkind) } - case string: + case []byte: switch qkind { case qvalue.QValueKindString: - return qvalue.QValueString{Val: v}, nil + return qvalue.QValueString{Val: string(v)}, nil case qvalue.QValueKindBytes: - return qvalue.QValueBytes{Val: []byte(v)}, nil + return qvalue.QValueBytes{Val: v}, nil case qvalue.QValueKindJSON: - return qvalue.QValueJSON{Val: v}, nil + return qvalue.QValueJSON{Val: string(v)}, nil default: return nil, fmt.Errorf("cannot convert string to %s", qkind) } diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index f4651e938d..319c2c7a0c 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -210,6 +210,7 @@ func (c *MySqlConnector) PullQRepRecords( } stream.Records <- record } + close(stream.Records) return len(rs.Values), nil } diff --git a/flow/workflows/snapshot_flow.go b/flow/workflows/snapshot_flow.go index bc43f5ad30..a1425358d6 100644 --- a/flow/workflows/snapshot_flow.go +++ b/flow/workflows/snapshot_flow.go @@ -316,31 +316,30 @@ func SnapshotFlowWorkflow( numTablesInParallel := int(max(config.SnapshotNumTablesInParallel, 1)) - if !config.DoInitialSnapshot { - _, err := se.setupReplication(ctx) - if err != nil { - return fmt.Errorf("failed to setup replication: %w", err) - } - - if err := se.closeSlotKeepAlive(ctx); err != nil { - return fmt.Errorf("failed to close slot keep alive: %w", err) - } - - return nil - } - sessionOpts := &workflow.SessionOptions{ CreationTimeout: 5 * time.Minute, ExecutionTimeout: time.Hour * 24 * 365 * 100, // 100 years HeartbeatTimeout: time.Hour, } - sessionCtx, err := workflow.CreateSession(ctx, sessionOpts) if err != nil { return fmt.Errorf("failed to create session: %w", err) } defer workflow.CompleteSession(sessionCtx) + if !config.DoInitialSnapshot { + _, err := se.setupReplication(sessionCtx) + if err != nil { + return fmt.Errorf("failed to setup replication: %w", err) + } + + if err := se.closeSlotKeepAlive(sessionCtx); err != nil { + return fmt.Errorf("failed to close slot keep alive: %w", err) + } + + return nil + } + if config.InitialSnapshotOnly { sessionInfo := workflow.GetSessionInfo(sessionCtx) From b28325fc68597df49b9307219deccef97f749624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 3 Jan 2025 21:46:04 +0000 Subject: [PATCH 25/80] syncer gets recreated each batch --- flow/activities/flowable_core.go | 6 +-- 
flow/connectors/mysql/cdc.go | 38 ++++++++++++++----- flow/connectors/mysql/mysql.go | 20 ++-------- .../postgres/qrep_partition_test.go | 3 +- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/flow/activities/flowable_core.go b/flow/activities/flowable_core.go index 803c31fa96..28ee6300d2 100644 --- a/flow/activities/flowable_core.go +++ b/flow/activities/flowable_core.go @@ -279,7 +279,7 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon if temporal.IsApplicationError(err) { return nil, err } else { - return nil, fmt.Errorf("failed to pull records: %w", err) + return nil, fmt.Errorf("[cdc] failed to pull records: %w", err) } } syncState.Store(shared.Ptr("bookkeeping")) @@ -288,7 +288,7 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon lastCheckpoint := recordBatchSync.GetLastCheckpoint() if err := srcConn.UpdateReplStateLastOffset(ctx, lastCheckpoint); err != nil { a.Alerter.LogFlowError(ctx, flowName, err) - return 0, err + return nil, err } if err := monitoring.UpdateNumRowsAndEndLSNForCDCBatch( @@ -455,7 +455,7 @@ func replicateQRepPartition[TRead any, TWrite StreamCloser, TSync connectors.QRe tmp, err := pullRecords(srcConn, errCtx, config, partition, stream) if err != nil { a.Alerter.LogFlowError(ctx, config.FlowJobName, err) - return fmt.Errorf("failed to pull records: %w", err) + return fmt.Errorf("[qrep] failed to pull records: %w", err) } numRecords := int64(tmp) if err := monitoring.UpdatePullEndTimeAndRowsForPartition( diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index d8ce16753a..99537eb62e 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log/slog" + "math/rand/v2" "time" "github.com/go-mysql-org/go-mysql/mysql" @@ -131,18 +132,34 @@ func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { return nil } -//nolint:unused -func (c *MySqlConnector) startCdcStreamingFilePos(lastOffsetName string, lastOffsetPos uint32) (*replication.BinlogStreamer, error) { - return c.syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) +func (c *MySqlConnector) startSyncer() *replication.BinlogSyncer { + //nolint:gosec + return replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ + ServerID: rand.Uint32(), + Flavor: c.config.Flavor, + Host: c.config.Host, + Port: uint16(c.config.Port), + User: c.config.User, + Password: c.config.Password, + UseDecimal: true, + ParseTime: true, + }) } -func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication.BinlogStreamer, error) { - // https://hevodata.com/learn/mysql-gtids-and-replication-set-up - return c.syncer.StartSyncGTID(gset) +//nolint:unused +func (c *MySqlConnector) startCdcStreamingFilePos( + lastOffsetName string, lastOffsetPos uint32, +) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { + syncer := c.startSyncer() + stream, err := syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) + return syncer, stream, err } -func (c *MySqlConnector) closeSyncer() { - c.syncer.Close() +func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { + // https://hevodata.com/learn/mysql-gtids-and-replication-set-up + syncer := c.startSyncer() + stream, err := syncer.StartSyncGTID(gset) + return syncer, stream, err } func (c *MySqlConnector) ReplPing(context.Context) error { @@ -192,11 +209,12 @@ func (c *MySqlConnector) 
PullRecords( if err != nil { return err } - mystream, err := c.startCdcStreamingGtid(gset) + + syncer, mystream, err := c.startCdcStreamingGtid(gset) if err != nil { return err } - defer c.closeSyncer() + defer syncer.Close() var fetchedBytesCounter metric.Int64Counter if otelManager != nil { diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 56bc9369c1..5c72dc2c46 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -12,7 +12,6 @@ import ( "github.com/go-mysql-org/go-mysql/client" "github.com/go-mysql-org/go-mysql/mysql" - "github.com/go-mysql-org/go-mysql/replication" "go.temporal.io/sdk/log" metadataStore "github.com/PeerDB-io/peerdb/flow/connectors/external_metadata" @@ -25,7 +24,6 @@ type MySqlConnector struct { *metadataStore.PostgresMetadata config *protos.MySqlConfig conn *client.Conn - syncer *replication.BinlogSyncer logger log.Logger replState mysql.GTIDSet } @@ -35,28 +33,14 @@ func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlC if err != nil { return nil, err } - syncer := replication.NewBinlogSyncer(replication.BinlogSyncerConfig{ - ServerID: 1729, // TODO put in config (or generate randomly, which is what go-mysql-org does) - Flavor: config.Flavor, - Host: config.Host, - Port: uint16(config.Port), - User: config.User, - Password: config.Password, - UseDecimal: true, - ParseTime: true, - }) return &MySqlConnector{ PostgresMetadata: pgMetadata, config: config, - syncer: syncer, logger: shared.LoggerFromCtx(ctx), }, nil } func (c *MySqlConnector) Close() error { - if c.syncer != nil { - c.syncer.Close() - } if c.conn != nil { return c.conn.Close() } @@ -92,7 +76,9 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf if err != nil { return nil, fmt.Errorf("failed to connect to mysql server: %w", err) } - c.conn.Execute("SET sql_mode = ANSI") + if _, err := c.conn.Execute("SET sql_mode = ANSI"); err != nil { + return nil, fmt.Errorf("failed to set sql_mode to ANSI: %w", err) + } } rs, err := c.conn.Execute(cmd, args...) diff --git a/flow/connectors/postgres/qrep_partition_test.go b/flow/connectors/postgres/qrep_partition_test.go index 0c89b378db..3154fb08e1 100644 --- a/flow/connectors/postgres/qrep_partition_test.go +++ b/flow/connectors/postgres/qrep_partition_test.go @@ -86,8 +86,7 @@ func TestGetQRepPartitions(t *testing.T) { defer conn.Close(context.Background()) //nolint:gosec // Generate a random schema name, number has no cryptographic significance - rndUint := rand.Uint64() - schemaName := fmt.Sprintf("test_%d", rndUint) + schemaName := fmt.Sprintf("test_%d", rand.Uint64()) // Create the schema _, err = conn.Exec(context.Background(), fmt.Sprintf(`CREATE SCHEMA %s;`, schemaName)) From 02648fb09b53aad116ee9b088bea2d5ba3850051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 3 Jan 2025 23:55:06 +0000 Subject: [PATCH 26/80] Support file position streaming. 
Probably not supposed to mix these modes like this --- flow/activities/flowable_core.go | 1 + flow/connectors/mysql/cdc.go | 55 ++++++++++++++++++++++---------- flow/connectors/mysql/mysql.go | 7 ++-- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/flow/activities/flowable_core.go b/flow/activities/flowable_core.go index 28ee6300d2..ef4df798e8 100644 --- a/flow/activities/flowable_core.go +++ b/flow/activities/flowable_core.go @@ -286,6 +286,7 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon syncDuration := time.Since(syncStartTime) lastCheckpoint := recordBatchSync.GetLastCheckpoint() + logger.Info("batch synced", slog.Any("checkpoint", lastCheckpoint)) if err := srcConn.UpdateReplStateLastOffset(ctx, lastCheckpoint); err != nil { a.Alerter.LogFlowError(ctx, flowName, err) return nil, err diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 99537eb62e..07395445fa 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -6,6 +6,8 @@ import ( "fmt" "log/slog" "math/rand/v2" + "strconv" + "strings" "time" "github.com/go-mysql-org/go-mysql/mysql" @@ -39,7 +41,7 @@ func (c *MySqlConnector) GetTableSchema( return nil, err } res[tableName] = tableSchema - c.logger.Info("fetched schema for table " + tableName) + c.logger.Info("fetched schema for table", slog.String("table", tableName)) } return res, nil @@ -146,12 +148,34 @@ func (c *MySqlConnector) startSyncer() *replication.BinlogSyncer { }) } -//nolint:unused +func (c *MySqlConnector) startStreaming(pos string) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { + if rest, isFile := strings.CutPrefix(pos, "!f:"); isFile { + comma := strings.LastIndexByte(rest, ',') + if comma == -1 { + return nil, nil, fmt.Errorf("no comma in file/pos offset %s", pos) + } + offset, err := strconv.ParseUint(rest[comma+1:], 16, 32) + if err != nil { + return nil, nil, fmt.Errorf("invalid offset in filepos offset %s: %w", pos, err) + } + return c.startCdcStreamingFilePos(rest[:comma], uint32(offset)) + } else { + gset, err := mysql.ParseGTIDSet(c.config.Flavor, pos) + if err != nil { + return nil, nil, err + } + return c.startCdcStreamingGtid(gset) + } +} + func (c *MySqlConnector) startCdcStreamingFilePos( lastOffsetName string, lastOffsetPos uint32, ) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { syncer := c.startSyncer() stream, err := syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) + if err != nil { + syncer.Close() + } return syncer, stream, err } @@ -159,6 +183,9 @@ func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication // https://hevodata.com/learn/mysql-gtids-and-replication-set-up syncer := c.startSyncer() stream, err := syncer.StartSyncGTID(gset) + if err != nil { + syncer.Close() + } return syncer, stream, err } @@ -167,7 +194,6 @@ func (c *MySqlConnector) ReplPing(context.Context) error { } func (c *MySqlConnector) UpdateReplStateLastOffset(ctx context.Context, lastOffset model.CdcCheckpoint) error { - // TODO assert c.replState == lastOffset flowName := ctx.Value(shared.FlowNameKey).(string) return c.SetLastOffset(ctx, flowName, lastOffset) } @@ -205,12 +231,8 @@ func (c *MySqlConnector) PullRecords( req *model.PullRecordsRequest[model.RecordItems], ) error { defer req.RecordStream.Close() - gset, err := mysql.ParseGTIDSet(c.config.Flavor, req.LastOffset.Text) - if err != nil { - return err - } - syncer, mystream, err := c.startCdcStreamingGtid(gset) + syncer, 
mystream, err := c.startStreaming(req.LastOffset.Text) if err != nil { return err } @@ -230,7 +252,7 @@ func (c *MySqlConnector) PullRecords( defer cancelTimeout() var recordCount uint32 - for { + for recordCount < req.MaxBatchSize { event, err := mystream.GetEvent(timeoutCtx) if err != nil { if errors.Is(err, context.DeadlineExceeded) { @@ -246,6 +268,8 @@ func (c *MySqlConnector) PullRecords( } switch ev := event.Event.(type) { + case *replication.RotateEvent: + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%d", string(ev.NextLogName), ev.Position)) case *replication.MariadbGTIDEvent: var err error newset, err := ev.GTIDNext() @@ -253,7 +277,7 @@ func (c *MySqlConnector) PullRecords( // TODO could ignore, but then we might get stuck rereading same batch each time return err } - c.replState = newset + req.RecordStream.UpdateLatestCheckpointText(newset.String()) case *replication.GTIDEvent: var err error newset, err := ev.GTIDNext() @@ -261,7 +285,10 @@ func (c *MySqlConnector) PullRecords( // TODO could ignore, but then we might get stuck rereading same batch each time return err } - c.replState = newset + req.RecordStream.UpdateLatestCheckpointText(newset.String()) + case *replication.PreviousGTIDsEvent: + // TODO is this the correct way to handle this event? + req.RecordStream.UpdateLatestCheckpointText(ev.GTIDSets) case *replication.RowsEvent: sourceTableName := string(ev.Table.Schema) + "." + string(ev.Table.Table) // TODO this is fragile destinationTableName := req.TableNameMapping[sourceTableName].Name @@ -333,14 +360,10 @@ func (c *MySqlConnector) PullRecords( } } default: - continue } } - - if recordCount >= req.MaxBatchSize { - return nil - } } + return nil } func qvalueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 5c72dc2c46..8a7a380134 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -22,10 +22,9 @@ import ( type MySqlConnector struct { *metadataStore.PostgresMetadata - config *protos.MySqlConfig - conn *client.Conn - logger log.Logger - replState mysql.GTIDSet + config *protos.MySqlConfig + conn *client.Conn + logger log.Logger } func NewMySqlConnector(ctx context.Context, config *protos.MySqlConfig) (*MySqlConnector, error) { From 58193d01cb745229dcec557fef1a25eb1bb18efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sun, 5 Jan 2025 00:38:36 +0000 Subject: [PATCH 27/80] ExecuteSelectStreaming --- flow/connectors/mysql/mysql.go | 60 +++++++++++++++++++++++++++ flow/connectors/mysql/qrep.go | 75 ++++++++++++++++++---------------- 2 files changed, 100 insertions(+), 35 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 8a7a380134..31a2742b47 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -94,6 +94,66 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf } } +func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, result *mysql.Result, + rowCb client.SelectPerRowCallback, + resultCb client.SelectPerResultCallback, + args ...interface{}, +) error { + reconnects := 3 + for { + // TODO need new connection if ctx changes between calls, or make upstream PR + if c.conn == nil { + var err error + var argF []client.Option + if !c.config.DisableTls { + argF = append(argF, func(conn *client.Conn) error { + conn.SetTLSConfig(&tls.Config{MinVersion: tls.VersionTLS13}) + 
return nil + }) + } + c.conn, err = c.connect(ctx, argF...) + if err != nil { + return fmt.Errorf("failed to connect to mysql server: %w", err) + } + if _, err := c.conn.Execute("SET sql_mode = ANSI"); err != nil { + return fmt.Errorf("failed to set sql_mode to ANSI: %w", err) + } + } + + if len(args) == 0 { + if err := c.conn.ExecuteSelectStreaming(cmd, result, rowCb, resultCb); err != nil { + if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { + reconnects -= 1 + c.conn.Close() + c.conn = nil + continue + } + return err + } + } else { + stmt, err := c.conn.Prepare(cmd) + if err != nil { + if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { + reconnects -= 1 + c.conn.Close() + c.conn = nil + continue + } + return err + } + if err := stmt.ExecuteSelectStreaming(result, rowCb, resultCb, args...); err != nil { + if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { + reconnects -= 1 + c.conn.Close() + c.conn = nil + continue + } + return err + } + } + } +} + func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, error) { showBinlogStatus := "SHOW BINARY LOG STATUS" if eq, err := c.conn.CompareServerVersion("8.4.0"); (err == nil) && (eq < 0) { diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 319c2c7a0c..879a19f02f 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -152,12 +152,45 @@ func (c *MySqlConnector) PullQRepRecords( return 0, err } - var rs *mysql.Result + totalRecords := 0 + onResult := func(rs *mysql.Result) error { + schema := make([]qvalue.QField, 0, len(rs.Fields)) + for _, field := range rs.Fields { + qkind, err := qkindFromMysql(field.Type) + if err != nil { + return err + } + + schema = append(schema, qvalue.QField{ + Name: string(field.Name), + Type: qkind, + Precision: 0, // TODO numerics + Scale: 0, // TODO numerics + Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, + }) + } + stream.SetSchema(qvalue.QRecordSchema{Fields: schema}) + return nil + } + onRow := func(row []mysql.FieldValue) error { + totalRecords += 1 // TODO can this be batched in onResult or by checking rs at end? 
+ schema := stream.Schema() + record := make([]qvalue.QValue, 0, len(row)) + for idx, val := range row { + qv, err := qvalueFromMysqlFieldValue(schema.Fields[idx].Type, val) + if err != nil { + return err + } + record = append(record, qv) + } + stream.Records <- record + return nil + } + if last.FullTablePartition { - var err error // this is a full table partition, so just run the query - rs, err = c.Execute(ctx, query) - if err != nil { + var rs mysql.Result + if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult); err != nil { return 0, err } } else { @@ -176,42 +209,14 @@ func (c *MySqlConnector) PullQRepRecords( return 0, fmt.Errorf("unknown range type: %v", x) } - var err error - rs, err = c.Execute(ctx, query, rangeStart, rangeEnd) - if err != nil { + var rs mysql.Result + if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult, rangeStart, rangeEnd); err != nil { return 0, err } } - schema := make([]qvalue.QField, 0, len(rs.Fields)) - for _, field := range rs.Fields { - qkind, err := qkindFromMysql(field.Type) - if err != nil { - return 0, err - } - - schema = append(schema, qvalue.QField{ - Name: string(field.Name), - Type: qkind, - Precision: 0, // TODO numerics - Scale: 0, // TODO numerics - Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, - }) - } - stream.SetSchema(qvalue.QRecordSchema{Fields: schema}) - for _, row := range rs.Values { - record := make([]qvalue.QValue, 0, len(row)) - for idx, val := range row { - qv, err := qvalueFromMysqlFieldValue(schema[idx].Type, val) - if err != nil { - return 0, err - } - record = append(record, qv) - } - stream.Records <- record - } close(stream.Records) - return len(rs.Values), nil + return totalRecords, nil } func BuildQuery(logger log.Logger, query string) (string, error) { From 4a67a781bc6b0199ef522ebdd13a401c6bae48de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 6 Jan 2025 20:34:40 +0000 Subject: [PATCH 28/80] Support gtid_mode not ON --- flow/connectors/mysql/cdc.go | 20 +++++++++++++++++--- flow/connectors/mysql/mysql.go | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 07395445fa..33517dcf2f 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -121,12 +121,26 @@ func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { return fmt.Errorf("[mysql] SetupReplConn failed to GetLastOffset: %w", err) } if offset.Text == "" { - set, err := c.GetMasterGTIDSet(ctx) + gtidModeOn, err := c.GetGtidModeOn(ctx) if err != nil { - return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterGTIDSet: %w", err) + return err + } + var lastOffsetText string + if gtidModeOn { + set, err := c.GetMasterGTIDSet(ctx) + if err != nil { + return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterGTIDSet: %w", err) + } + lastOffsetText = set.String() + } else { + pos, err := c.GetMasterPos(ctx) + if err != nil { + return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterPos: %w", err) + } + lastOffsetText = fmt.Sprintf("!f:%s,%d", pos.Name, pos.Pos) } if err := c.SetLastOffset( - ctx, flowName, model.CdcCheckpoint{Text: set.String()}, + ctx, flowName, model.CdcCheckpoint{Text: lastOffsetText}, ); err != nil { return fmt.Errorf("[mysql] SetupReplConn failed to SetLastOffset: %w", err) } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 31a2742b47..3401dc2e21 100644 --- a/flow/connectors/mysql/mysql.go +++ 
b/flow/connectors/mysql/mysql.go @@ -154,6 +154,20 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, } } +func (c *MySqlConnector) GetGtidModeOn(ctx context.Context) (bool, error) { + rr, err := c.Execute(ctx, "select @@global.gtid_mode") + if err != nil { + return false, err + } + + gtid_mode, err := rr.GetString(0, 0) + if err != nil { + return false, err + } + + return gtid_mode == "ON", nil +} + func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, error) { showBinlogStatus := "SHOW BINARY LOG STATUS" if eq, err := c.conn.CompareServerVersion("8.4.0"); (err == nil) && (eq < 0) { From 9d6d5134b183efc094638b3961f4a84f485ffee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 7 Jan 2025 20:54:34 +0000 Subject: [PATCH 29/80] e2e wip --- flow/e2e/clickhouse/peer_flow_ch_test.go | 6 +- flow/e2e/congen.go | 162 ++--------------------- flow/e2e/mysql/mysql.go | 71 ++++++++++ flow/e2e/pg.go | 157 ++++++++++++++++++++++ flow/e2e/test_utils.go | 20 +-- 5 files changed, 251 insertions(+), 165 deletions(-) create mode 100644 flow/e2e/mysql/mysql.go create mode 100644 flow/e2e/pg.go diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index a670b5ee33..283264b8df 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -26,7 +26,11 @@ import ( //go:embed test_data/* var testData embed.FS -func TestPeerFlowE2ETestSuiteCH(t *testing.T) { +func TestPeerFlowE2ETestSuitePG_CH(t *testing.T) { + e2eshared.RunSuite(t, SetupSuite) +} + +func TestPeerFlowE2ETestSuiteMySQL_CH(t *testing.T) { e2eshared.RunSuite(t, SetupSuite) } diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index 110220cfe4..80d84315db 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -2,169 +2,15 @@ package e2e import ( "context" - "errors" - "fmt" "testing" - "time" - "github.com/jackc/pgx/v5" "github.com/stretchr/testify/require" - connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/connectors/utils" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/peerdbenv" ) -func cleanPostgres(conn *pgx.Conn, suffix string) error { - // drop the e2e_test schema with the given suffix if it exists - if _, err := conn.Exec(context.Background(), fmt.Sprintf("DROP SCHEMA IF EXISTS e2e_test_%s CASCADE", suffix)); err != nil { - return fmt.Errorf("failed to drop e2e_test schema: %w", err) - } - - // drop all open slots with the given suffix - if _, err := conn.Exec( - context.Background(), - "SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name LIKE $1", - "%_"+suffix, - ); err != nil { - return fmt.Errorf("failed to drop replication slots: %w", err) - } - - // list all publications from pg_publication table - rows, err := conn.Query(context.Background(), - "SELECT pubname FROM pg_publication WHERE pubname LIKE $1", - "%_"+suffix, - ) - if err != nil { - return fmt.Errorf("failed to list publications: %w", err) - } - publications, err := pgx.CollectRows[string](rows, pgx.RowTo) - if err != nil { - return fmt.Errorf("failed to read publications: %w", err) - } - - for _, pubName := range publications { - if _, err := conn.Exec(context.Background(), "DROP PUBLICATION "+pubName); err != nil { - return fmt.Errorf("failed to drop publication %s: %w", pubName, err) - } - } - - return nil -} - -func setupPostgresSchema(t *testing.T, conn *pgx.Conn, suffix string) error 
{ - t.Helper() - - setupTx, err := conn.Begin(context.Background()) - if err != nil { - return errors.New("failed to start setup transaction") - } - - // create an e2e_test schema - if _, err := setupTx.Exec(context.Background(), "SELECT pg_advisory_xact_lock(hashtext('Megaton Mile'))"); err != nil { - return fmt.Errorf("failed to get lock: %w", err) - } - defer func() { - deferErr := setupTx.Rollback(context.Background()) - if deferErr != pgx.ErrTxClosed && deferErr != nil { - t.Errorf("error rolling back setup transaction: %v", err) - } - }() - - // create an e2e_test schema - if _, err := setupTx.Exec(context.Background(), "CREATE SCHEMA e2e_test_"+suffix); err != nil { - return fmt.Errorf("failed to create e2e_test schema: %w", err) - } - - if _, err := setupTx.Exec(context.Background(), ` - CREATE OR REPLACE FUNCTION random_string( int ) RETURNS TEXT as $$ - SELECT string_agg(substring('0123456789bcdfghjkmnpqrstvwxyz', - round(random() * 30)::integer, 1), '') FROM generate_series(1, $1); - $$ language sql; - CREATE OR REPLACE FUNCTION random_bytea(bytea_length integer) - RETURNS bytea AS $body$ - SELECT decode(string_agg(lpad(to_hex(width_bucket(random(), 0, 1, 256)-1),2,'0'), ''), 'hex') - FROM generate_series(1, $1); - $body$ - LANGUAGE 'sql' - VOLATILE - SET search_path = 'pg_catalog'; - `); err != nil { - return fmt.Errorf("failed to create utility functions: %w", err) - } - - return setupTx.Commit(context.Background()) -} - -// SetupPostgres sets up the postgres connection. -func SetupPostgres(t *testing.T, suffix string) (*connpostgres.PostgresConnector, error) { - t.Helper() - - connector, err := connpostgres.NewPostgresConnector(context.Background(), - nil, peerdbenv.GetCatalogPostgresConfigFromEnv(context.Background())) - if err != nil { - return nil, fmt.Errorf("failed to create postgres connection: %w", err) - } - conn := connector.Conn() - - if err := cleanPostgres(conn, suffix); err != nil { - connector.Close() - return nil, err - } - - if err := setupPostgresSchema(t, conn, suffix); err != nil { - connector.Close() - return nil, err - } - - return connector, nil -} - -func TearDownPostgres[T Suite](s T) { - t := s.T() - t.Helper() - - conn := s.Connector().Conn() - if conn != nil { - suffix := s.Suffix() - t.Log("begin tearing down postgres schema", suffix) - deadline := time.Now().Add(2 * time.Minute) - for { - err := cleanPostgres(conn, suffix) - if err == nil { - conn.Close(context.Background()) - return - } else if time.Now().After(deadline) { - require.Fail(t, "failed to teardown postgres schema", "%s: %v", suffix, err) - } - time.Sleep(time.Second) - } - } -} - -// GeneratePostgresPeer generates a postgres peer config for testing. 
-func GeneratePostgresPeer(t *testing.T) *protos.Peer { - t.Helper() - peer := &protos.Peer{ - Name: "catalog", - Type: protos.DBType_POSTGRES, - Config: &protos.Peer_PostgresConfig{ - PostgresConfig: peerdbenv.GetCatalogPostgresConfigFromEnv(context.Background()), - }, - } - CreatePeer(t, peer) - return peer -} - -type FlowConnectionGenerationConfig struct { - FlowJobName string - TableNameMapping map[string]string - Destination string - TableMappings []*protos.TableMapping - SoftDelete bool -} - func TableMappings(s GenericSuite, tables ...string) []*protos.TableMapping { if len(tables)&1 != 0 { panic("must receive even number of table names") @@ -191,6 +37,14 @@ func CreatePeer(t *testing.T, peer *protos.Peer) { } } +type FlowConnectionGenerationConfig struct { + FlowJobName string + TableNameMapping map[string]string + Destination string + TableMappings []*protos.TableMapping + SoftDelete bool +} + func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs(t *testing.T) *protos.FlowConnectionConfigs { t.Helper() tblMappings := c.TableMappings diff --git a/flow/e2e/mysql/mysql.go b/flow/e2e/mysql/mysql.go new file mode 100644 index 0000000000..a2abfc8d9f --- /dev/null +++ b/flow/e2e/mysql/mysql.go @@ -0,0 +1,71 @@ +package e2e_postgres + +import ( + "context" + "fmt" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/PeerDB-io/peer-flow/connectors" + "github.com/PeerDB-io/peer-flow/connectors/mysql" + "github.com/PeerDB-io/peer-flow/e2e" + "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/model" + "github.com/PeerDB-io/peer-flow/shared" +) + +type PeerFlowE2ETestSuiteMySQL struct { + t *testing.T + + conn *connmysql.MySqlConnector + suffix string +} + +func (s PeerFlowE2ETestSuiteMySQL) T() *testing.T { + return s.t +} + +func (s PeerFlowE2ETestSuiteMySQL) Connector() *connmysql.MySqlConnector { + return s.conn +} + +func (s PeerFlowE2ETestSuiteMySQL) DestinationConnector() connectors.Connector { + return s.conn +} + +func (s PeerFlowE2ETestSuiteMySQL) Suffix() string { + return s.suffix +} + +func (s PeerFlowE2ETestSuiteMySQL) Peer() *protos.Peer { + return e2e.GeneratePostgresPeer(s.t) +} + +func (s PeerFlowE2ETestSuiteMySQL) DestinationTable(table string) string { + return e2e.AttachSchema(s, table) +} + +func (s PeerFlowE2ETestSuiteMySQL) GetRows(table string, cols string) (*model.QRecordBatch, error) { + s.t.Helper() + panic("TODO") +} + +func SetupSuite(t *testing.T) PeerFlowE2ETestSuiteMySQL { + t.Helper() + + suffix := "pg_" + strings.ToLower(shared.RandomString(8)) + conn, err := e2e.SetupPostgres(t, suffix) + require.NoError(t, err, "failed to setup postgres") + + return PeerFlowE2ETestSuiteMySQL{ + t: t, + conn: conn, + suffix: suffix, + } +} + +func (s PeerFlowE2ETestSuiteMySQL) Teardown() { + // TODO for mysql +} diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go new file mode 100644 index 0000000000..38a3b3eeb5 --- /dev/null +++ b/flow/e2e/pg.go @@ -0,0 +1,157 @@ +package e2e + +import ( + "context" + "errors" + "fmt" + "testing" + "time" + + "github.com/jackc/pgx/v5" + "github.com/stretchr/testify/require" + + connpostgres "github.com/PeerDB-io/peer-flow/connectors/postgres" + "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/peerdbenv" +) + +func cleanPostgres(conn *pgx.Conn, suffix string) error { + // drop the e2e_test schema with the given suffix if it exists + if _, err := conn.Exec(context.Background(), fmt.Sprintf("DROP SCHEMA IF EXISTS e2e_test_%s CASCADE", 
suffix)); err != nil { + return fmt.Errorf("failed to drop e2e_test schema: %w", err) + } + + // drop all open slots with the given suffix + if _, err := conn.Exec( + context.Background(), + "SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name LIKE $1", + "%_"+suffix, + ); err != nil { + return fmt.Errorf("failed to drop replication slots: %w", err) + } + + // list all publications from pg_publication table + rows, err := conn.Query(context.Background(), + "SELECT pubname FROM pg_publication WHERE pubname LIKE $1", + "%_"+suffix, + ) + if err != nil { + return fmt.Errorf("failed to list publications: %w", err) + } + publications, err := pgx.CollectRows[string](rows, pgx.RowTo) + if err != nil { + return fmt.Errorf("failed to read publications: %w", err) + } + + for _, pubName := range publications { + if _, err := conn.Exec(context.Background(), "DROP PUBLICATION "+pubName); err != nil { + return fmt.Errorf("failed to drop publication %s: %w", pubName, err) + } + } + + return nil +} + +func setupPostgresSchema(t *testing.T, conn *pgx.Conn, suffix string) error { + t.Helper() + + setupTx, err := conn.Begin(context.Background()) + if err != nil { + return errors.New("failed to start setup transaction") + } + + // create an e2e_test schema + if _, err := setupTx.Exec(context.Background(), "SELECT pg_advisory_xact_lock(hashtext('Megaton Mile'))"); err != nil { + return fmt.Errorf("failed to get lock: %w", err) + } + defer func() { + deferErr := setupTx.Rollback(context.Background()) + if deferErr != pgx.ErrTxClosed && deferErr != nil { + t.Errorf("error rolling back setup transaction: %v", err) + } + }() + + // create an e2e_test schema + if _, err := setupTx.Exec(context.Background(), "CREATE SCHEMA e2e_test_"+suffix); err != nil { + return fmt.Errorf("failed to create e2e_test schema: %w", err) + } + + if _, err := setupTx.Exec(context.Background(), ` + CREATE OR REPLACE FUNCTION random_string( int ) RETURNS TEXT as $$ + SELECT string_agg(substring('0123456789bcdfghjkmnpqrstvwxyz', + round(random() * 30)::integer, 1), '') FROM generate_series(1, $1); + $$ language sql; + CREATE OR REPLACE FUNCTION random_bytea(bytea_length integer) + RETURNS bytea AS $body$ + SELECT decode(string_agg(lpad(to_hex(width_bucket(random(), 0, 1, 256)-1),2,'0'), ''), 'hex') + FROM generate_series(1, $1); + $body$ + LANGUAGE 'sql' + VOLATILE + SET search_path = 'pg_catalog'; + `); err != nil { + return fmt.Errorf("failed to create utility functions: %w", err) + } + + return setupTx.Commit(context.Background()) +} + +// SetupPostgres sets up the postgres connection. 
+func SetupPostgres(t *testing.T, suffix string) (*connpostgres.PostgresConnector, error) { + t.Helper() + + connector, err := connpostgres.NewPostgresConnector(context.Background(), + nil, peerdbenv.GetCatalogPostgresConfigFromEnv(context.Background())) + if err != nil { + return nil, fmt.Errorf("failed to create postgres connection: %w", err) + } + conn := connector.Conn() + + if err := cleanPostgres(conn, suffix); err != nil { + connector.Close() + return nil, err + } + + if err := setupPostgresSchema(t, conn, suffix); err != nil { + connector.Close() + return nil, err + } + + return connector, nil +} + +func TearDownPostgres[T Suite](s T) { + t := s.T() + t.Helper() + + conn := s.Connector().Conn() + if conn != nil { + suffix := s.Suffix() + t.Log("begin tearing down postgres schema", suffix) + deadline := time.Now().Add(2 * time.Minute) + for { + err := cleanPostgres(conn, suffix) + if err == nil { + conn.Close(context.Background()) + return + } else if time.Now().After(deadline) { + require.Fail(t, "failed to teardown postgres schema", "%s: %v", suffix, err) + } + time.Sleep(time.Second) + } + } +} + +// GeneratePostgresPeer generates a postgres peer config for testing. +func GeneratePostgresPeer(t *testing.T) *protos.Peer { + t.Helper() + peer := &protos.Peer{ + Name: "catalog", + Type: protos.DBType_POSTGRES, + Config: &protos.Peer_PostgresConfig{ + PostgresConfig: peerdbenv.GetCatalogPostgresConfigFromEnv(context.Background()), + }, + } + CreatePeer(t, peer) + return peer +} diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index 37705e0675..ad1109cea8 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -39,30 +39,30 @@ func init() { _ = godotenv.Load() } -type Suite interface { +type Suite[TSource connectors.Connector] interface { e2eshared.Suite T() *testing.T - Connector() *connpostgres.PostgresConnector + Connector() TSource Suffix() string } -type RowSource interface { - Suite +type RowSource[TSource connectors.Connector] interface { + Suite[TSource] GetRows(table, cols string) (*model.QRecordBatch, error) } -type GenericSuite interface { - RowSource +type GenericSuite[TSource connectors.Connector] interface { + RowSource[TSource] Peer() *protos.Peer DestinationConnector() connectors.Connector DestinationTable(table string) string } -func AttachSchema(s Suite, table string) string { +func AttachSchema(s interface{ Suffix() string }, table string) string { return fmt.Sprintf("e2e_test_%s.%s", s.Suffix(), table) } -func AddSuffix(s Suite, str string) string { +func AddSuffix[T connectors.Connector](s Suite[T], str string) string { return fmt.Sprintf("%s_%s", str, s.Suffix()) } @@ -542,7 +542,7 @@ func GetOwnersSelectorStringsSF() [2]string { return [2]string{strings.Join(pgFields, ","), strings.Join(sfFields, ",")} } -func ExpectedDestinationIdentifier(s GenericSuite, ident string) string { +func ExpectedDestinationIdentifier[T connectors.Connector](s GenericSuite[T], ident string) string { switch s.DestinationConnector().(type) { case *connsnowflake.SnowflakeConnector: return strings.ToUpper(ident) @@ -551,7 +551,7 @@ func ExpectedDestinationIdentifier(s GenericSuite, ident string) string { } } -func ExpectedDestinationTableName(s GenericSuite, table string) string { +func ExpectedDestinationTableName[T connectors.Connector](s GenericSuite[T], table string) string { return ExpectedDestinationIdentifier(s, s.DestinationTable(table)) } From 103a3dbcfc4320efcce089d57ac8b9cc88bdf509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 
7 Jan 2025 21:02:14 +0000 Subject: [PATCH 30/80] import renames --- flow/connectors/mysql/qrep.go | 8 ++++---- flow/e2e/mysql/mysql.go | 12 ++++++------ flow/e2e/pg.go | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 879a19f02f..619d9ae83b 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -12,10 +12,10 @@ import ( "github.com/google/uuid" "go.temporal.io/sdk/log" - utils "github.com/PeerDB-io/peer-flow/connectors/utils/partition" - "github.com/PeerDB-io/peer-flow/generated/protos" - "github.com/PeerDB-io/peer-flow/model" - "github.com/PeerDB-io/peer-flow/model/qvalue" + utils "github.com/PeerDB-io/peerdb/flow/connectors/utils/partition" + "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" + "github.com/PeerDB-io/peerdb/flow/model/qvalue" ) func (c *MySqlConnector) GetQRepPartitions( diff --git a/flow/e2e/mysql/mysql.go b/flow/e2e/mysql/mysql.go index a2abfc8d9f..fe5dc8d838 100644 --- a/flow/e2e/mysql/mysql.go +++ b/flow/e2e/mysql/mysql.go @@ -8,12 +8,12 @@ import ( "github.com/stretchr/testify/require" - "github.com/PeerDB-io/peer-flow/connectors" - "github.com/PeerDB-io/peer-flow/connectors/mysql" - "github.com/PeerDB-io/peer-flow/e2e" - "github.com/PeerDB-io/peer-flow/generated/protos" - "github.com/PeerDB-io/peer-flow/model" - "github.com/PeerDB-io/peer-flow/shared" + "github.com/PeerDB-io/peerdb/flow/connectors" + "github.com/PeerDB-io/peerdb/flow/connectors/mysql" + "github.com/PeerDB-io/peerdb/flow/e2e" + "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" + "github.com/PeerDB-io/peerdb/flow/shared" ) type PeerFlowE2ETestSuiteMySQL struct { diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 38a3b3eeb5..7acdbb4aab 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -10,9 +10,9 @@ import ( "github.com/jackc/pgx/v5" "github.com/stretchr/testify/require" - connpostgres "github.com/PeerDB-io/peer-flow/connectors/postgres" - "github.com/PeerDB-io/peer-flow/generated/protos" - "github.com/PeerDB-io/peer-flow/peerdbenv" + connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" + "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/peerdbenv" ) func cleanPostgres(conn *pgx.Conn, suffix string) error { From 92e27e71b6df582e5977cf3a7d32e175b911cf12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 8 Jan 2025 15:53:27 +0000 Subject: [PATCH 31/80] fix lints --- flow/connectors/mysql/qrep.go | 5 +- flow/e2e/bigquery/bigquery.go | 2 +- flow/e2e/bigquery/peer_flow_bq_test.go | 36 +++++----- flow/e2e/clickhouse/clickhouse.go | 46 ++++++------ flow/e2e/clickhouse/peer_flow_ch_test.go | 28 ++++---- flow/e2e/congen.go | 42 ++++++++++- flow/e2e/elasticsearch/elasticsearch.go | 2 +- flow/e2e/elasticsearch/peer_flow_es_test.go | 4 +- flow/e2e/eventhub/peer_flow_eh_test.go | 4 +- flow/e2e/generic/generic_test.go | 13 ++-- flow/e2e/kafka/kafka_test.go | 10 +-- flow/e2e/mysql/mysql.go | 71 ------------------- flow/e2e/pg.go | 50 ++++++++++--- flow/e2e/postgres/peer_flow_pg_test.go | 31 ++++---- flow/e2e/postgres/postgres.go | 2 +- flow/e2e/pubsub/pubsub_test.go | 8 +-- flow/e2e/s3/cdc_s3_test.go | 2 +- flow/e2e/s3/qrep_flow_s3_test.go | 2 +- flow/e2e/snowflake/peer_flow_sf_test.go | 26 +++---- flow/e2e/snowflake/snowflake.go | 2 +- .../e2e/sqlserver/qrep_flow_sqlserver_test.go | 2 +- flow/e2e/test_utils.go | 49 +++++++++---- 22 
files changed, 236 insertions(+), 201 deletions(-) delete mode 100644 flow/e2e/mysql/mysql.go diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 619d9ae83b..292ffad786 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -174,7 +174,10 @@ func (c *MySqlConnector) PullQRepRecords( } onRow := func(row []mysql.FieldValue) error { totalRecords += 1 // TODO can this be batched in onResult or by checking rs at end? - schema := stream.Schema() + schema, err := stream.Schema() + if err != nil { + return err + } record := make([]qvalue.QValue, 0, len(row)) for idx, val := range row { qv, err := qvalueFromMysqlFieldValue(schema.Fields[idx].Type, val) diff --git a/flow/e2e/bigquery/bigquery.go b/flow/e2e/bigquery/bigquery.go index 0acb08dee9..98d8e79b94 100644 --- a/flow/e2e/bigquery/bigquery.go +++ b/flow/e2e/bigquery/bigquery.go @@ -110,7 +110,7 @@ func SetupSuite(t *testing.T) PeerFlowE2ETestSuiteBQ { return PeerFlowE2ETestSuiteBQ{ t: t, bqSuffix: bqSuffix, - conn: conn, + conn: conn.PostgresConnector, bqHelper: bqHelper, } } diff --git a/flow/e2e/bigquery/peer_flow_bq_test.go b/flow/e2e/bigquery/peer_flow_bq_test.go index 8fae1c40f4..a0a7d558c7 100644 --- a/flow/e2e/bigquery/peer_flow_bq_test.go +++ b/flow/e2e/bigquery/peer_flow_bq_test.go @@ -119,7 +119,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Complete_Flow_No_Data() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -150,7 +150,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Char_ColType_Error() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -182,7 +182,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -233,7 +233,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_1_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -289,7 +289,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_2_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -339,7 +339,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_3_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -395,7 +395,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Types_BQ() { Destination: s.Peer().Name, } - flowConnConfig := 
connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -477,7 +477,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_NaN_Doubles_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -529,7 +529,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Invalid_Geo_BQ_Avro_CDC() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -605,7 +605,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Multi_Table_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -659,7 +659,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Simple_Schema_Changes_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -737,7 +737,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_All_Types_Schema_Changes_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -805,7 +805,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -861,7 +861,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_Toast_1_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -921,7 +921,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_Toast_2_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -972,7 +972,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Columns_BQ() { SoftDelete: true, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -1022,7 +1022,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Multi_Table_Multi_Dataset_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := 
connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -1362,7 +1362,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_JSON_PKey_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" diff --git a/flow/e2e/clickhouse/clickhouse.go b/flow/e2e/clickhouse/clickhouse.go index d45df68730..1ce59cb986 100644 --- a/flow/e2e/clickhouse/clickhouse.go +++ b/flow/e2e/clickhouse/clickhouse.go @@ -25,7 +25,7 @@ import ( type ClickHouseSuite struct { t *testing.T - conn *connpostgres.PostgresConnector + source e2e.SuiteSource s3Helper *e2e_s3.S3TestHelper suffix string } @@ -35,7 +35,7 @@ func (s ClickHouseSuite) T() *testing.T { } func (s ClickHouseSuite) Connector() *connpostgres.PostgresConnector { - return s.conn + return s.source.Connector().(*connpostgres.PostgresConnector) } func (s ClickHouseSuite) DestinationConnector() connectors.Connector { @@ -88,7 +88,7 @@ func (s ClickHouseSuite) DestinationTable(table string) string { func (s ClickHouseSuite) Teardown() { require.NoError(s.t, s.s3Helper.CleanUp(context.Background())) - e2e.TearDownPostgres(s) + s.source.Teardown(s.t, s.Suffix()) } func (s ClickHouseSuite) GetRows(table string, cols string) (*model.QRecordBatch, error) { @@ -216,27 +216,33 @@ func (s ClickHouseSuite) GetRows(table string, cols string) (*model.QRecordBatch return batch, rows.Err() } -func SetupSuite(t *testing.T) ClickHouseSuite { +func SetupSuite[TSource e2e.SuiteSource]( + t *testing.T, + setupSource func(*testing.T, string) (TSource, error), +) func(*testing.T) ClickHouseSuite { t.Helper() + return func(t *testing.T) ClickHouseSuite { + t.Helper() - suffix := "ch_" + strings.ToLower(shared.RandomString(8)) - conn, err := e2e.SetupPostgres(t, suffix) - require.NoError(t, err, "failed to setup postgres") + suffix := "ch_" + strings.ToLower(shared.RandomString(8)) + source, err := setupSource(t, suffix) + require.NoError(t, err, "failed to setup postgres") - s3Helper, err := e2e_s3.NewS3TestHelper(e2e_s3.Minio) - require.NoError(t, err, "failed to setup S3") + s3Helper, err := e2e_s3.NewS3TestHelper(e2e_s3.Minio) + require.NoError(t, err, "failed to setup S3") - s := ClickHouseSuite{ - t: t, - conn: conn, - suffix: suffix, - s3Helper: s3Helper, - } + s := ClickHouseSuite{ + t: t, + source: e2e.SuiteSource(source), + suffix: suffix, + s3Helper: s3Helper, + } - ch, err := connclickhouse.Connect(context.Background(), nil, s.PeerForDatabase("default").GetClickhouseConfig()) - require.NoError(t, err, "failed to connect to clickhouse") - err = ch.Exec(context.Background(), "CREATE DATABASE e2e_test_"+suffix) - require.NoError(t, err, "failed to create clickhouse database") + ch, err := connclickhouse.Connect(context.Background(), nil, s.PeerForDatabase("default").GetClickhouseConfig()) + require.NoError(t, err, "failed to connect to clickhouse") + err = ch.Exec(context.Background(), "CREATE DATABASE e2e_test_"+suffix) + require.NoError(t, err, "failed to create clickhouse database") - return s + return s + } } diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index 283264b8df..68700c3d65 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -27,11 +27,11 @@ import ( var 
testData embed.FS func TestPeerFlowE2ETestSuitePG_CH(t *testing.T) { - e2eshared.RunSuite(t, SetupSuite) + e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupPostgres)) } func TestPeerFlowE2ETestSuiteMySQL_CH(t *testing.T) { - e2eshared.RunSuite(t, SetupSuite) + // TODO e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupMySQL)) } func (s ClickHouseSuite) attachSchemaSuffix(tableName string) string { @@ -72,7 +72,7 @@ func (s ClickHouseSuite) Test_Addition_Removal() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -218,7 +218,7 @@ func (s ClickHouseSuite) Test_NullableMirrorSetting() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_NULLABLE": "true"} @@ -265,7 +265,7 @@ func (s ClickHouseSuite) Test_NullableColumnSetting() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true for _, tm := range flowConnConfig.TableMappings { tm.Columns = []*protos.ColumnSetting{ @@ -317,7 +317,7 @@ func (s ClickHouseSuite) Test_Date32() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) @@ -360,7 +360,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Disabled() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_ENABLE_PRIMARY_UPDATE": "false"} @@ -408,7 +408,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Enabled() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_ENABLE_PRIMARY_UPDATE": "true"} @@ -448,7 +448,7 @@ func (s ClickHouseSuite) Test_Replident_Full_Unchanged_TOAST_Updates() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -493,7 +493,7 @@ func (s ClickHouseSuite) WeirdTable(tableName string) { TableNameMapping: map[string]string{s.attachSchemaSuffix(tableName): dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := 
connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -577,7 +577,7 @@ func (s ClickHouseSuite) Test_Large_Numeric() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) @@ -633,7 +633,7 @@ func (s ClickHouseSuite) testNumericFF(ffValue bool) { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_UNBOUNDED_NUMERIC_AS_STRING": strconv.FormatBool(ffValue)} tc := e2e.NewTemporalClient(s.t) @@ -697,7 +697,7 @@ func (s ClickHouseSuite) testBinaryFormat(format string, expected string) { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_BINARY_FORMAT": format} tc := e2e.NewTemporalClient(s.t) @@ -779,7 +779,7 @@ func (s ClickHouseSuite) Test_Types_CH() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index 80d84315db..494de06521 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -6,12 +6,19 @@ import ( "github.com/stretchr/testify/require" + "github.com/PeerDB-io/peerdb/flow/connectors" "github.com/PeerDB-io/peerdb/flow/connectors/utils" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/peerdbenv" ) -func TableMappings(s GenericSuite, tables ...string) []*protos.TableMapping { +type SuiteSource interface { + Teardown(t *testing.T, suffix string) + GeneratePeer(t *testing.T) *protos.Peer + Connector() connectors.Connector +} + +func TableMappings[TSource connectors.Connector](s GenericSuite[TSource], tables ...string) []*protos.TableMapping { if len(tables)&1 != 0 { panic("must receive even number of table names") } @@ -45,7 +52,38 @@ type FlowConnectionGenerationConfig struct { SoftDelete bool } -func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs(t *testing.T) *protos.FlowConnectionConfigs { +func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs( + t *testing.T, + source SuiteSource, +) *protos.FlowConnectionConfigs { + t.Helper() + tblMappings := c.TableMappings + if tblMappings == nil { + for k, v := range c.TableNameMapping { + tblMappings = append(tblMappings, &protos.TableMapping{ + SourceTableIdentifier: k, + DestinationTableIdentifier: v, + }) + } + } + + ret := &protos.FlowConnectionConfigs{ + FlowJobName: c.FlowJobName, + TableMappings: 
tblMappings, + SourceName: source.GeneratePeer(t).Name, + DestinationName: c.Destination, + SyncedAtColName: "_PEERDB_SYNCED_AT", + IdleTimeoutSeconds: 15, + } + if c.SoftDelete { + ret.SoftDeleteColName = "_PEERDB_IS_DELETED" + } + return ret +} + +func (c *FlowConnectionGenerationConfig) GeneratePostgresFlowConnectionConfigs( + t *testing.T, +) *protos.FlowConnectionConfigs { t.Helper() tblMappings := c.TableMappings if tblMappings == nil { diff --git a/flow/e2e/elasticsearch/elasticsearch.go b/flow/e2e/elasticsearch/elasticsearch.go index 9b2dd3cc9c..c8b1645bbe 100644 --- a/flow/e2e/elasticsearch/elasticsearch.go +++ b/flow/e2e/elasticsearch/elasticsearch.go @@ -54,7 +54,7 @@ func SetupSuite(t *testing.T) elasticsearchSuite { return elasticsearchSuite{ t: t, - conn: conn, + conn: conn.PostgresConnector, esClient: esClient, esAddresses: esAddresses, suffix: suffix, diff --git a/flow/e2e/elasticsearch/peer_flow_es_test.go b/flow/e2e/elasticsearch/peer_flow_es_test.go index d7f471dde6..07aff0d918 100644 --- a/flow/e2e/elasticsearch/peer_flow_es_test.go +++ b/flow/e2e/elasticsearch/peer_flow_es_test.go @@ -30,7 +30,7 @@ func (s elasticsearchSuite) Test_Simple_PKey_CDC_Mirror() { TableNameMapping: map[string]string{srcTableName: srcTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.DoInitialSnapshot = true @@ -99,7 +99,7 @@ func (s elasticsearchSuite) Test_Composite_PKey_CDC_Mirror() { TableNameMapping: map[string]string{srcTableName: srcTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.DoInitialSnapshot = true diff --git a/flow/e2e/eventhub/peer_flow_eh_test.go b/flow/e2e/eventhub/peer_flow_eh_test.go index e48f959395..33a46ecb7f 100644 --- a/flow/e2e/eventhub/peer_flow_eh_test.go +++ b/flow/e2e/eventhub/peer_flow_eh_test.go @@ -112,7 +112,7 @@ func SetupSuite(t *testing.T) EventhubsSuite { return EventhubsSuite{ t: t, - conn: conn, + conn: conn.PostgresConnector, suffix: suffix, timeSuffix: tsSuffix, } @@ -150,7 +150,7 @@ func (s EventhubsSuite) Test_EH_Simple() { TableNameMapping: map[string]string{srcTableName: scopedEventhubName}, Destination: destinationPeer.Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_eh_simple_script" tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index 3dd5af3397..b6c794c581 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/require" "github.com/PeerDB-io/peerdb/flow/connectors" + "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/e2e" e2e_bigquery "github.com/PeerDB-io/peerdb/flow/e2e/bigquery" e2e_clickhouse "github.com/PeerDB-io/peerdb/flow/e2e/clickhouse" @@ -32,14 +33,14 @@ func TestGenericBQ(t *testing.T) { } func TestGenericCH(t *testing.T) { - e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite)) + e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite(t, 
e2e.SetupPostgres))) } type Generic struct { - e2e.GenericSuite + e2e.GenericSuite[*connpostgres.PostgresConnector] } -func SetupGenericSuite[T e2e.GenericSuite](f func(t *testing.T) T) func(t *testing.T) Generic { +func SetupGenericSuite[T e2e.GenericSuite[*connpostgres.PostgresConnector]](f func(t *testing.T) T) func(t *testing.T) Generic { return func(t *testing.T) Generic { t.Helper() return Generic{f(t)} @@ -67,7 +68,7 @@ func (s Generic) Test_Simple_Flow() { TableMappings: e2e.TableMappings(s, srcTable, dstTable), Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) tc := e2e.NewTemporalClient(t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -116,7 +117,7 @@ func (s Generic) Test_Simple_Schema_Changes() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) // wait for PeerFlowStatusQuery to finish setup // and then insert and mutate schema repeatedly. @@ -330,7 +331,7 @@ func (s Generic) Test_Partitioned_Table() { TableMappings: e2e.TableMappings(s, srcTable, dstTable), Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) tc := e2e.NewTemporalClient(t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/kafka/kafka_test.go b/flow/e2e/kafka/kafka_test.go index e6246b280f..37b7f0d177 100644 --- a/flow/e2e/kafka/kafka_test.go +++ b/flow/e2e/kafka/kafka_test.go @@ -73,7 +73,7 @@ func SetupSuite(t *testing.T) KafkaSuite { return KafkaSuite{ t: t, - conn: conn, + conn: conn.PostgresConnector, suffix: suffix, } } @@ -103,7 +103,7 @@ func (s KafkaSuite) TestSimple() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_kasimple" tc := e2e.NewTemporalClient(s.t) @@ -162,7 +162,7 @@ func (s KafkaSuite) TestMessage() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_kamessage" tc := e2e.NewTemporalClient(s.t) @@ -217,7 +217,7 @@ func (s KafkaSuite) TestDefault() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -272,7 +272,7 @@ func (s KafkaSuite) TestInitialLoad() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.DoInitialSnapshot = true _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` diff --git a/flow/e2e/mysql/mysql.go b/flow/e2e/mysql/mysql.go deleted file mode 100644 index fe5dc8d838..0000000000 --- a/flow/e2e/mysql/mysql.go 
+++ /dev/null @@ -1,71 +0,0 @@ -package e2e_postgres - -import ( - "context" - "fmt" - "strings" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/PeerDB-io/peerdb/flow/connectors" - "github.com/PeerDB-io/peerdb/flow/connectors/mysql" - "github.com/PeerDB-io/peerdb/flow/e2e" - "github.com/PeerDB-io/peerdb/flow/generated/protos" - "github.com/PeerDB-io/peerdb/flow/model" - "github.com/PeerDB-io/peerdb/flow/shared" -) - -type PeerFlowE2ETestSuiteMySQL struct { - t *testing.T - - conn *connmysql.MySqlConnector - suffix string -} - -func (s PeerFlowE2ETestSuiteMySQL) T() *testing.T { - return s.t -} - -func (s PeerFlowE2ETestSuiteMySQL) Connector() *connmysql.MySqlConnector { - return s.conn -} - -func (s PeerFlowE2ETestSuiteMySQL) DestinationConnector() connectors.Connector { - return s.conn -} - -func (s PeerFlowE2ETestSuiteMySQL) Suffix() string { - return s.suffix -} - -func (s PeerFlowE2ETestSuiteMySQL) Peer() *protos.Peer { - return e2e.GeneratePostgresPeer(s.t) -} - -func (s PeerFlowE2ETestSuiteMySQL) DestinationTable(table string) string { - return e2e.AttachSchema(s, table) -} - -func (s PeerFlowE2ETestSuiteMySQL) GetRows(table string, cols string) (*model.QRecordBatch, error) { - s.t.Helper() - panic("TODO") -} - -func SetupSuite(t *testing.T) PeerFlowE2ETestSuiteMySQL { - t.Helper() - - suffix := "pg_" + strings.ToLower(shared.RandomString(8)) - conn, err := e2e.SetupPostgres(t, suffix) - require.NoError(t, err, "failed to setup postgres") - - return PeerFlowE2ETestSuiteMySQL{ - t: t, - conn: conn, - suffix: suffix, - } -} - -func (s PeerFlowE2ETestSuiteMySQL) Teardown() { - // TODO for mysql -} diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 7acdbb4aab..1764d68fee 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -10,6 +10,7 @@ import ( "github.com/jackc/pgx/v5" "github.com/stretchr/testify/require" + "github.com/PeerDB-io/peerdb/flow/connectors" connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/peerdbenv" @@ -96,8 +97,11 @@ func setupPostgresSchema(t *testing.T, conn *pgx.Conn, suffix string) error { return setupTx.Commit(context.Background()) } -// SetupPostgres sets up the postgres connection. -func SetupPostgres(t *testing.T, suffix string) (*connpostgres.PostgresConnector, error) { +type PostgresSource struct { + *connpostgres.PostgresConnector +} + +func SetupPostgres(t *testing.T, suffix string) (*PostgresSource, error) { t.Helper() connector, err := connpostgres.NewPostgresConnector(context.Background(), @@ -117,16 +121,18 @@ func SetupPostgres(t *testing.T, suffix string) (*connpostgres.PostgresConnector return nil, err } - return connector, nil + return &PostgresSource{PostgresConnector: connector}, nil } -func TearDownPostgres[T Suite](s T) { - t := s.T() +func (s *PostgresSource) Connector() connectors.Connector { + return s.PostgresConnector +} + +func (s *PostgresSource) Teardown(t *testing.T, suffix string) { t.Helper() - conn := s.Connector().Conn() - if conn != nil { - suffix := s.Suffix() + if s.PostgresConnector != nil { + conn := s.PostgresConnector.Conn() t.Log("begin tearing down postgres schema", suffix) deadline := time.Now().Add(2 * time.Minute) for { @@ -142,7 +148,33 @@ func TearDownPostgres[T Suite](s T) { } } -// GeneratePostgresPeer generates a postgres peer config for testing. 
+func TearDownPostgres(s Suite[*connpostgres.PostgresConnector]) { + t := s.T() + t.Helper() + + conn := s.Connector() + if conn != nil { + conn := s.Connector().Conn() + t.Log("begin tearing down postgres schema", s.Suffix()) + deadline := time.Now().Add(2 * time.Minute) + for { + err := cleanPostgres(conn, s.Suffix()) + if err == nil { + conn.Close(context.Background()) + return + } else if time.Now().After(deadline) { + require.Fail(t, "failed to teardown postgres schema", "%s: %v", s.Suffix(), err) + } + time.Sleep(time.Second) + } + } +} + +func (s *PostgresSource) GeneratePeer(t *testing.T) *protos.Peer { + t.Helper() + return GeneratePostgresPeer(t) +} + func GeneratePostgresPeer(t *testing.T) *protos.Peer { t.Helper() peer := &protos.Peer{ diff --git a/flow/e2e/postgres/peer_flow_pg_test.go b/flow/e2e/postgres/peer_flow_pg_test.go index 1c90ab0fed..85ecae24a3 100644 --- a/flow/e2e/postgres/peer_flow_pg_test.go +++ b/flow/e2e/postgres/peer_flow_pg_test.go @@ -67,7 +67,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Geospatial_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 tc := e2e.NewTemporalClient(s.t) @@ -112,7 +112,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Types_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -179,7 +179,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Enums_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -221,7 +221,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -283,7 +283,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_Toast_1_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -348,7 +348,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_Toast_2_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -406,7 +406,7 @@ func (s PeerFlowE2ETestSuitePG) Test_PeerDB_Columns() { SoftDelete: true, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -825,7 +825,7 @@ func (s PeerFlowE2ETestSuitePG) Test_ContinueAsNew() { Destination: s.Peer().Name, } - flowConnConfig := 
connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 2 flowConnConfig.IdleTimeoutSeconds = 10 @@ -966,12 +966,14 @@ func (s PeerFlowE2ETestSuitePG) Test_Dynamic_Mirror_Config_Via_Signals() { func (s PeerFlowE2ETestSuitePG) Test_CustomSync() { srcTableName := s.attachSchemaSuffix("test_customsync") dstTableName := s.attachSchemaSuffix("test_customsync_dst") + connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: s.attachSuffix("test_customsync_flow"), TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id SERIAL PRIMARY KEY, @@ -979,26 +981,31 @@ func (s PeerFlowE2ETestSuitePG) Test_CustomSync() { value TEXT NOT NULL ); `, srcTableName)) + require.NoError(s.t, err) tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) + e2e.SignalWorkflow(env, model.FlowSignal, model.PauseSignal) e2e.EnvWaitFor(s.t, env, 1*time.Minute, "paused workflow", func() bool { return e2e.EnvGetFlowStatus(s.t, env) == protos.FlowStatus_STATUS_PAUSED }) + e2e.SignalWorkflow(env, model.CDCDynamicPropertiesSignal, &protos.CDCFlowConfigUpdate{ NumberOfSyncs: 1, }) e2e.EnvWaitFor(s.t, env, 1*time.Minute, "resumed workflow", func() bool { return e2e.EnvGetFlowStatus(s.t, env) == protos.FlowStatus_STATUS_RUNNING }) + _, err = s.Conn().Exec(context.Background(), fmt.Sprintf( "INSERT INTO %s(key, value) VALUES ('test_key', 'test_value')", srcTableName)) e2e.EnvNoError(s.t, env, err) e2e.EnvWaitFor(s.t, env, 3*time.Minute, "paused workflow", func() bool { return e2e.EnvGetFlowStatus(s.t, env) == protos.FlowStatus_STATUS_PAUSED }) + require.NoError(s.t, s.comparePGTables(srcTableName, dstTableName, "id,key,value")) env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -1033,7 +1040,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TypeSystem_PG() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.DoInitialSnapshot = true flowConnConfig.System = protos.TypeSystem_PG flowConnConfig.SoftDeleteColName = "" @@ -1082,7 +1089,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TransformRecordScript() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "cdc_transform_record" tc := e2e.NewTemporalClient(s.t) @@ -1131,7 +1138,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TransformRowScript() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "cdc_transform_row" tc := e2e.NewTemporalClient(s.t) diff --git a/flow/e2e/postgres/postgres.go b/flow/e2e/postgres/postgres.go index 4491c91a42..7bc32a3be8 100644 --- a/flow/e2e/postgres/postgres.go 
+++ b/flow/e2e/postgres/postgres.go @@ -74,7 +74,7 @@ func SetupSuite(t *testing.T) PeerFlowE2ETestSuitePG { return PeerFlowE2ETestSuitePG{ t: t, - conn: conn, + conn: conn.PostgresConnector, suffix: suffix, } } diff --git a/flow/e2e/pubsub/pubsub_test.go b/flow/e2e/pubsub/pubsub_test.go index 53ea57c32d..742f403e3c 100644 --- a/flow/e2e/pubsub/pubsub_test.go +++ b/flow/e2e/pubsub/pubsub_test.go @@ -108,7 +108,7 @@ func SetupSuite(t *testing.T) PubSubSuite { return PubSubSuite{ t: t, - conn: conn, + conn: conn.PostgresConnector, suffix: suffix, } } @@ -141,7 +141,7 @@ func (s PubSubSuite) TestCreateTopic() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_pscreate" tc := e2e.NewTemporalClient(s.t) @@ -193,7 +193,7 @@ func (s PubSubSuite) TestSimple() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_pssimple" psclient, err := sa.CreatePubSubClient(context.Background()) @@ -263,7 +263,7 @@ func (s PubSubSuite) TestInitialLoad() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.Script = "e2e_psinitial" flowConnConfig.DoInitialSnapshot = true diff --git a/flow/e2e/s3/cdc_s3_test.go b/flow/e2e/s3/cdc_s3_test.go index 05e6964cad..48fcfbf40e 100644 --- a/flow/e2e/s3/cdc_s3_test.go +++ b/flow/e2e/s3/cdc_s3_test.go @@ -39,7 +39,7 @@ func (s PeerFlowE2ETestSuiteS3) Test_Complete_Simple_Flow_S3() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 5 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/s3/qrep_flow_s3_test.go b/flow/e2e/s3/qrep_flow_s3_test.go index f8e886073a..e8f49c3d2e 100644 --- a/flow/e2e/s3/qrep_flow_s3_test.go +++ b/flow/e2e/s3/qrep_flow_s3_test.go @@ -83,7 +83,7 @@ func setupSuite(t *testing.T, s3environment S3Environment) PeerFlowE2ETestSuiteS return PeerFlowE2ETestSuiteS3{ t: t, - conn: conn, + conn: conn.PostgresConnector, s3Helper: helper, suffix: suffix, } diff --git a/flow/e2e/snowflake/peer_flow_sf_test.go b/flow/e2e/snowflake/peer_flow_sf_test.go index e0bf746a8b..a74117bbad 100644 --- a/flow/e2e/snowflake/peer_flow_sf_test.go +++ b/flow/e2e/snowflake/peer_flow_sf_test.go @@ -58,7 +58,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Flow_ReplicaIdentity_Index_No_Pkey() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -111,7 +111,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Invalid_Numeric() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.DoInitialSnapshot = true tc := 
e2e.NewTemporalClient(s.t) @@ -165,7 +165,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Invalid_Geo_SF_Avro_CDC() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) // wait for PeerFlowStatusQuery to finish setup // and then insert 10 rows into the source table @@ -255,7 +255,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -306,7 +306,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_1_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -362,7 +362,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_2_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -413,7 +413,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_3_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -471,7 +471,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Types_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -566,7 +566,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Multi_Table_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -621,7 +621,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -676,7 +676,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_Toast_1_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -738,7 +738,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_Toast_2_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -1145,7 +1145,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Supported_Mixed_Case_Table_SF() { Destination: 
s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup diff --git a/flow/e2e/snowflake/snowflake.go b/flow/e2e/snowflake/snowflake.go index 35fac5f2a8..8a741852b7 100644 --- a/flow/e2e/snowflake/snowflake.go +++ b/flow/e2e/snowflake/snowflake.go @@ -99,7 +99,7 @@ func SetupSuite(t *testing.T) PeerFlowE2ETestSuiteSF { suite := PeerFlowE2ETestSuiteSF{ t: t, pgSuffix: pgSuffix, - conn: conn, + conn: conn.PostgresConnector, sfHelper: sfHelper, connector: connector, } diff --git a/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go b/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go index cdf89da88b..230a092dc2 100644 --- a/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go +++ b/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go @@ -92,7 +92,7 @@ func SetupSuite(t *testing.T) PeerFlowE2ETestSuiteSQLServer { return PeerFlowE2ETestSuiteSQLServer{ t: t, - conn: conn, + conn: conn.PostgresConnector, sqlsHelper: sqlsHelper, suffix: suffix, } diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index ad1109cea8..d2763ab720 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -22,6 +22,7 @@ import ( "go.temporal.io/sdk/temporal" "github.com/PeerDB-io/peerdb/flow/connectors" + connmysql "github.com/PeerDB-io/peerdb/flow/connectors/mysql" connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" connsnowflake "github.com/PeerDB-io/peerdb/flow/connectors/snowflake" "github.com/PeerDB-io/peerdb/flow/e2eshared" @@ -100,39 +101,57 @@ func GetPgRows(conn *connpostgres.PostgresConnector, suffix string, table string ) } -func RequireEqualTables(suite RowSource, table string, cols string) { +func GetMySqlRows(conn *connmysql.MySqlConnector, suffix string, table string, cols string) (*model.QRecordBatch, error) { + // TODO mysql + return nil, nil +} + +func GetSuiteSourceRows[TSource connectors.Connector](suite Suite[TSource], table string, cols string) (*model.QRecordBatch, error) { + switch conn := any(suite.Connector()).(type) { + case *connpostgres.PostgresConnector: + return GetPgRows(conn, suite.Suffix(), table, cols) + case *connmysql.MySqlConnector: + return GetMySqlRows(conn, suite.Suffix(), table, cols) + default: + panic("unknown connector type") + } +} + +func RequireEqualTables[TSource connectors.Connector](suite RowSource[TSource], table string, cols string) { t := suite.T() t.Helper() - pgRows, err := GetPgRows(suite.Connector(), suite.Suffix(), table, cols) + sourceRows, err := GetSuiteSourceRows(suite, table, cols) require.NoError(t, err) rows, err := suite.GetRows(table, cols) require.NoError(t, err) - require.True(t, e2eshared.CheckEqualRecordBatches(t, pgRows, rows)) + require.True(t, e2eshared.CheckEqualRecordBatches(t, sourceRows, rows)) } -func EnvEqualTables(env WorkflowRun, suite RowSource, table string, cols string) { +func EnvEqualTables[TSource connectors.Connector](env WorkflowRun, suite RowSource[TSource], table string, cols string) { EnvEqualTablesWithNames(env, suite, table, table, cols) } -func EnvEqualTablesWithNames(env WorkflowRun, suite RowSource, srcTable string, dstTable string, cols string) { +func EnvEqualTablesWithNames[TSource connectors.Connector]( + env WorkflowRun, suite RowSource[TSource], srcTable string, dstTable string, cols string, +) { t := suite.T() t.Helper() - pgRows, err := GetPgRows(suite.Connector(), suite.Suffix(), srcTable, cols) + sourceRows, err 
:= GetSuiteSourceRows(suite, srcTable, cols) EnvNoError(t, env, err) rows, err := suite.GetRows(dstTable, cols) EnvNoError(t, env, err) - EnvEqualRecordBatches(t, env, pgRows, rows) + EnvEqualRecordBatches(t, env, sourceRows, rows) } -func EnvWaitForEqualTables( +func EnvWaitForEqualTables[TSource connectors.Connector]( env WorkflowRun, - suite RowSource, + suite RowSource[TSource], reason string, table string, cols string, @@ -141,9 +160,9 @@ func EnvWaitForEqualTables( EnvWaitForEqualTablesWithNames(env, suite, reason, table, table, cols) } -func EnvWaitForEqualTablesWithNames( +func EnvWaitForEqualTablesWithNames[TSource connectors.Connector]( env WorkflowRun, - suite RowSource, + suite RowSource[TSource], reason string, srcTable string, dstTable string, @@ -155,7 +174,7 @@ func EnvWaitForEqualTablesWithNames( EnvWaitFor(t, env, 3*time.Minute, reason, func() bool { t.Helper() - pgRows, err := GetPgRows(suite.Connector(), suite.Suffix(), srcTable, cols) + sourceRows, err := GetSuiteSourceRows(suite, srcTable, cols) if err != nil { t.Log(err) return false @@ -167,13 +186,13 @@ func EnvWaitForEqualTablesWithNames( return false } - return e2eshared.CheckEqualRecordBatches(t, pgRows, rows) + return e2eshared.CheckEqualRecordBatches(t, sourceRows, rows) }) } -func EnvWaitForCount( +func EnvWaitForCount[TSource connectors.Connector]( env WorkflowRun, - suite RowSource, + suite RowSource[TSource], reason string, dstTable string, cols string, From 088a077d4dae6b6a4abc4c0e51e8e19dbacd0c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 8 Jan 2025 17:51:46 +0000 Subject: [PATCH 32/80] run mysql in ci --- .github/workflows/flow.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/flow.yml b/.github/workflows/flow.yml index 47fee98492..2251082ab1 100644 --- a/.github/workflows/flow.yml +++ b/.github/workflows/flow.yml @@ -24,6 +24,12 @@ jobs: POSTGRES_PASSWORD: postgres POSTGRES_DB: postgres POSTGRES_INITDB_ARGS: --locale=C.UTF-8 + mysql: + image: mariadb:lts-ubi + ports: + - 3306:3306 + env: + MARIADB_ROOT_PASSWORD: maria redpanda: image: redpandadata/redpanda@sha256:7214ddaf8426d25936459cf77c1f905566a4483a97d2b13006120dcd98a5c846 ports: From 8ad297936c4d85364e9a15f7f47805cf2dd5aa45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 8 Jan 2025 22:10:22 +0000 Subject: [PATCH 33/80] mysql setup/teardown mysql databases & schemas are the same thing --- flow/e2e/mysql.go | 73 +++++++++++++++++++++++++++++++++++++++++++++++ flow/e2e/pg.go | 7 ++--- 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 flow/e2e/mysql.go diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go new file mode 100644 index 0000000000..214d74935a --- /dev/null +++ b/flow/e2e/mysql.go @@ -0,0 +1,73 @@ +package e2e + +import ( + "context" + "fmt" + "testing" + + "github.com/PeerDB-io/peerdb/flow/connectors" + "github.com/PeerDB-io/peerdb/flow/connectors/mysql" + "github.com/PeerDB-io/peerdb/flow/generated/protos" +) + +type MySqlSource struct { + *connmysql.MySqlConnector +} + +var mysqlConfig = &protos.MySqlConfig{ + Host: "localhost", + Port: 3306, + User: "root", + Password: "maria", + Database: "default", + Setup: nil, + Compression: 0, + DisableTls: true, + Flavor: "mariadb", +} + +func SetupMySQL(t *testing.T, suffix string) (*MySqlSource, error) { + t.Helper() + + connector, err := connmysql.NewMySqlConnector(context.Background(), mysqlConfig) + if err != nil { + return nil, fmt.Errorf("failed to create postgres connection: %w", err) + } + + 
if _, err := connector.Execute(context.Background(), "DROP DATABASE IF EXISTS e2e_test_"+suffix); err != nil { + connector.Close() + return nil, err + } + + if _, err := connector.Execute(context.Background(), "CREATE DATABASE e2e_test_"+suffix); err != nil { + connector.Close() + return nil, err + } + + return &MySqlSource{MySqlConnector: connector}, nil +} + +func (s *MySqlSource) Connector() connectors.Connector { + return s.MySqlConnector +} + +func (s *MySqlSource) Teardown(t *testing.T, suffix string) { + t.Helper() + if _, err := s.MySqlConnector.Execute(context.Background(), "DROP DATABASE IF EXISTS e2e_test_"+suffix); err != nil { + t.Log("failed to drop mysql database", err) + s.MySqlConnector.Close() + } +} + +func (s *MySqlSource) GeneratePeer(t *testing.T) *protos.Peer { + t.Helper() + peer := &protos.Peer{ + Name: "catalog", + Type: protos.DBType_MYSQL, + Config: &protos.Peer_MysqlConfig{ + MysqlConfig: mysqlConfig, + }, + } + CreatePeer(t, peer) + return peer +} diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 1764d68fee..398bb0885d 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -72,15 +72,14 @@ func setupPostgresSchema(t *testing.T, conn *pgx.Conn, suffix string) error { } }() - // create an e2e_test schema if _, err := setupTx.Exec(context.Background(), "CREATE SCHEMA e2e_test_"+suffix); err != nil { return fmt.Errorf("failed to create e2e_test schema: %w", err) } if _, err := setupTx.Exec(context.Background(), ` - CREATE OR REPLACE FUNCTION random_string( int ) RETURNS TEXT as $$ - SELECT string_agg(substring('0123456789bcdfghjkmnpqrstvwxyz', - round(random() * 30)::integer, 1), '') FROM generate_series(1, $1); + CREATE OR REPLACE FUNCTION random_string(int) RETURNS TEXT as $$ + SELECT string_agg(substring('0123456789bcdefghijkmnpqrstvwxyz', + round(random() * 32)::integer, 1), '') FROM generate_series(1, $1); $$ language sql; CREATE OR REPLACE FUNCTION random_bytea(bytea_length integer) RETURNS bytea AS $body$ From ff24e2d1c8ec5100d7a0666b7b1953c4ee407292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 9 Jan 2025 00:36:34 +0000 Subject: [PATCH 34/80] setup mysql source, skip when test relies on Connector --- flow/e2e/clickhouse/clickhouse.go | 6 +++++- flow/e2e/clickhouse/peer_flow_ch_test.go | 2 +- flow/e2e/mysql.go | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/flow/e2e/clickhouse/clickhouse.go b/flow/e2e/clickhouse/clickhouse.go index 1ce59cb986..b89fa6e052 100644 --- a/flow/e2e/clickhouse/clickhouse.go +++ b/flow/e2e/clickhouse/clickhouse.go @@ -35,7 +35,11 @@ func (s ClickHouseSuite) T() *testing.T { } func (s ClickHouseSuite) Connector() *connpostgres.PostgresConnector { - return s.source.Connector().(*connpostgres.PostgresConnector) + c, ok := s.source.Connector().(*connpostgres.PostgresConnector) + if !ok { + s.t.Skipf("skipping test because it relies on PostgresConnector, while source is %T", s.source) + } + return c } func (s ClickHouseSuite) DestinationConnector() connectors.Connector { diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index 68700c3d65..987bf592c7 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -31,7 +31,7 @@ func TestPeerFlowE2ETestSuitePG_CH(t *testing.T) { } func TestPeerFlowE2ETestSuiteMySQL_CH(t *testing.T) { - // TODO e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupMySQL)) + e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupMySQL)) } func (s ClickHouseSuite) attachSchemaSuffix(tableName 
string) string { diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 214d74935a..4199fc7dc4 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -19,7 +19,7 @@ var mysqlConfig = &protos.MySqlConfig{ Port: 3306, User: "root", Password: "maria", - Database: "default", + Database: "", Setup: nil, Compression: 0, DisableTls: true, @@ -62,7 +62,7 @@ func (s *MySqlSource) Teardown(t *testing.T, suffix string) { func (s *MySqlSource) GeneratePeer(t *testing.T) *protos.Peer { t.Helper() peer := &protos.Peer{ - Name: "catalog", + Name: "mysql", Type: protos.DBType_MYSQL, Config: &protos.Peer_MysqlConfig{ MysqlConfig: mysqlConfig, From 3b5a906ac0c2da71cc783f516b9980afca293698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 9 Jan 2025 13:50:57 +0000 Subject: [PATCH 35/80] GetMySqlRows --- flow/connectors/mysql/cdc.go | 10 +++++----- flow/connectors/mysql/mysql.go | 21 ++++++++++++++++++++- flow/connectors/mysql/qrep.go | 22 +++++----------------- flow/e2e/test_utils.go | 34 ++++++++++++++++++++++++++++++++-- 4 files changed, 62 insertions(+), 25 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 33517dcf2f..7132b8a48f 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -315,7 +315,7 @@ func (c *MySqlConnector) PullRecords( items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] - items.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + items.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -335,13 +335,13 @@ func (c *MySqlConnector) PullRecords( oldItems := model.NewRecordItems(len(oldRow)) for idx, val := range oldRow { fd := schema.Columns[idx] - oldItems.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + oldItems.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } newRow := ev.Rows[idx+1] newItems := model.NewRecordItems(len(newRow)) for idx, val := range ev.Rows[idx+1] { fd := schema.Columns[idx] - newItems.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + newItems.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -360,7 +360,7 @@ func (c *MySqlConnector) PullRecords( items := model.NewRecordItems(len(row)) for idx, val := range row { fd := schema.Columns[idx] - items.AddColumn(fd.Name, qvalueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + items.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) } recordCount += 1 @@ -380,7 +380,7 @@ func (c *MySqlConnector) PullRecords( return nil } -func qvalueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { +func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { // TODO signedness, in ev.Table, need to extend QValue system // See go-mysql row_event.go for mapping switch val := val.(type) { diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 3401dc2e21..addf7df25d 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -287,7 +287,26 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { } } -func qvalueFromMysqlFieldValue(qkind qvalue.QValueKind, 
fv mysql.FieldValue) (qvalue.QValue, error) { +func QRecordSchemaFromMysqlFields(fields []*mysql.Field) (qvalue.QRecordSchema, error) { + schema := make([]qvalue.QField, 0, len(fields)) + for _, field := range fields { + qkind, err := qkindFromMysql(field.Type) + if err != nil { + return qvalue.QRecordSchema{}, err + } + + schema = append(schema, qvalue.QField{ + Name: string(field.Name), + Type: qkind, + Precision: 0, // TODO numerics + Scale: 0, // TODO numerics + Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, + }) + } + return qvalue.QRecordSchema{Fields: schema}, nil +} + +func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qvalue.QValue, error) { // TODO fill this in, maybe contribute upstream, figvure out how numeric etc fit in switch v := fv.Value().(type) { case nil: diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 292ffad786..437625be94 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -138,7 +138,6 @@ func (c *MySqlConnector) GetQRepPartitions( return partitionHelper.GetPartitions(), nil } -// TODO use ExecuteStreamingSelect func (c *MySqlConnector) PullQRepRecords( ctx context.Context, config *protos.QRepConfig, @@ -154,22 +153,11 @@ func (c *MySqlConnector) PullQRepRecords( totalRecords := 0 onResult := func(rs *mysql.Result) error { - schema := make([]qvalue.QField, 0, len(rs.Fields)) - for _, field := range rs.Fields { - qkind, err := qkindFromMysql(field.Type) - if err != nil { - return err - } - - schema = append(schema, qvalue.QField{ - Name: string(field.Name), - Type: qkind, - Precision: 0, // TODO numerics - Scale: 0, // TODO numerics - Nullable: (field.Flag & mysql.NOT_NULL_FLAG) == 0, - }) + schema, err := QRecordSchemaFromMysqlFields(rs.Fields) + if err != nil { + return err } - stream.SetSchema(qvalue.QRecordSchema{Fields: schema}) + stream.SetSchema(schema) return nil } onRow := func(row []mysql.FieldValue) error { @@ -180,7 +168,7 @@ func (c *MySqlConnector) PullQRepRecords( } record := make([]qvalue.QValue, 0, len(row)) for idx, val := range row { - qv, err := qvalueFromMysqlFieldValue(schema.Fields[idx].Type, val) + qv, err := QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) if err != nil { return err } diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index d2763ab720..46a1fd2da5 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -102,11 +102,41 @@ func GetPgRows(conn *connpostgres.PostgresConnector, suffix string, table string } func GetMySqlRows(conn *connmysql.MySqlConnector, suffix string, table string, cols string) (*model.QRecordBatch, error) { - // TODO mysql - return nil, nil + rs, err := conn.Execute( + context.Background(), + fmt.Sprintf(`SELECT %s FROM e2e_test_%s.%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), + ) + if err != nil { + return nil, err + } + + schema, err := connmysql.QRecordSchemaFromMysqlFields(rs.Fields) + if err != nil { + return nil, err + } + + batch := &model.QRecordBatch{ + Schema: schema, + Records: nil, + } + + for _, row := range rs.Values { + record := make([]qvalue.QValue, 0, len(row)) + for idx, val := range row { + qv, err := connmysql.QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) + if err != nil { + return nil, err + } + record = append(record, qv) + } + batch.Records = append(batch.Records, record) + } + + return batch, nil } func GetSuiteSourceRows[TSource connectors.Connector](suite Suite[TSource], table string, cols string) (*model.QRecordBatch, error) { + // 
TODO move to SuiteSource switch conn := any(suite.Connector()).(type) { case *connpostgres.PostgresConnector: return GetPgRows(conn, suite.Suffix(), table, cols) From 1af056cc8a16b935ead036da5476becde0f13bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 9 Jan 2025 15:06:48 +0000 Subject: [PATCH 36/80] SuiteSource Exec. Requires some care that SQL is compatible with pg & mysql. eg key is keyword in MySQL --- flow/e2e/clickhouse/peer_flow_ch_test.go | 29 +++++++++--------------- flow/e2e/congen.go | 1 + flow/e2e/mysql.go | 5 ++++ flow/e2e/pg.go | 5 ++++ 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index 987bf592c7..eb7f4e9b36 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -173,14 +173,10 @@ func (s ClickHouseSuite) Test_Addition_Removal() { afterRemoveRunID := e2e.EnvGetRunID(s.t, env) require.NotEqual(s.t, runID, afterRemoveRunID) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key) VALUES ('test'); - `, srcTableName)) + _, err = s.Conn().Exec(context.Background(), fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", srcTableName)) require.NoError(s.t, err) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key) VALUES ('test'); - `, addedSrcTableName)) + _, err = s.Conn().Exec(context.Background(), fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", addedSrcTableName)) require.NoError(s.t, err) e2e.EnvWaitForEqualTablesWithNames(env, s, "second insert to added table", "test_table_add_remove_added", addedDstTableName, "id,key") @@ -298,19 +294,17 @@ func (s ClickHouseSuite) Test_Date32() { srcFullName := s.attachSchemaSuffix("test_date32") dstTableName := "test_date32_dst" - _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` + require.NoError(s.t, s.source.Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id SERIAL PRIMARY KEY, - key TEXT NOT NULL, + "key" TEXT NOT NULL, d DATE NOT NULL ); - `, srcFullName)) - require.NoError(s.t, err) + `, srcFullName))) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key,d) VALUES ('init','1935-01-01'); - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec( + fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('init','1935-01-01')`, srcFullName), + )) connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: s.attachSuffix("clickhouse_date32"), @@ -326,10 +320,9 @@ func (s ClickHouseSuite) Test_Date32() { e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key,d") - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key,d) VALUES ('cdc','1935-01-01'); - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec( + fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('cdc','1935-01-01')`, srcFullName), + )) e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key,d") diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index 494de06521..7449281bb3 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -16,6 +16,7 @@ type SuiteSource interface { Teardown(t *testing.T, suffix string) GeneratePeer(t *testing.T) *protos.Peer Connector() connectors.Connector + Exec(sql string) error } func TableMappings[TSource connectors.Connector](s GenericSuite[TSource], tables ...string) []*protos.TableMapping { diff --git 
a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 4199fc7dc4..5446bd9719 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -71,3 +71,8 @@ func (s *MySqlSource) GeneratePeer(t *testing.T) *protos.Peer { CreatePeer(t, peer) return peer } + +func (s *MySqlSource) Exec(sql string) error { + _, err := s.MySqlConnector.Execute(context.Background(), sql) + return err +} diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 398bb0885d..8f1146d51b 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -186,3 +186,8 @@ func GeneratePostgresPeer(t *testing.T) *protos.Peer { CreatePeer(t, peer) return peer } + +func (s *PostgresSource) Exec(sql string) error { + _, err := s.PostgresConnector.Conn().Exec(context.Background(), sql) + return err +} From c45fa8a654325c79faaca14911e2e94d6e7276db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 00:58:57 +0000 Subject: [PATCH 37/80] looks like serial is unsigned --- flow/connectors/mysql/mysql.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index addf7df25d..1273a90937 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -5,7 +5,6 @@ package connmysql import ( "context" "crypto/tls" - "errors" "fmt" "log/slog" "time" @@ -313,7 +312,16 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv return qvalue.QValueNull(qkind), nil case uint64: // TODO unsigned integers - return nil, errors.New("mysql unsigned integers not supported") + switch qkind { + case qvalue.QValueKindInt16: + return qvalue.QValueInt16{Val: int16(v)}, nil + case qvalue.QValueKindInt32: + return qvalue.QValueInt32{Val: int32(v)}, nil + case qvalue.QValueKindInt64: + return qvalue.QValueInt64{Val: int64(v)}, nil + default: + return nil, fmt.Errorf("cannot convert int to %s", qkind) + } case int64: switch qkind { case qvalue.QValueKindInt16: From 4b141eeb6a5f973b9d9ca2a56d246aba12fc629b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 01:27:04 +0000 Subject: [PATCH 38/80] log date's bytes --- flow/connectors/mysql/mysql.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 1273a90937..f64bb79278 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -320,7 +320,7 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv case qvalue.QValueKindInt64: return qvalue.QValueInt64{Val: int64(v)}, nil default: - return nil, fmt.Errorf("cannot convert int to %s", qkind) + return nil, fmt.Errorf("cannot convert uint64 to %s", qkind) } case int64: switch qkind { @@ -331,7 +331,7 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv case qvalue.QValueKindInt64: return qvalue.QValueInt64{Val: v}, nil default: - return nil, fmt.Errorf("cannot convert int to %s", qkind) + return nil, fmt.Errorf("cannot convert int64 to %s", qkind) } case float64: switch qkind { @@ -340,7 +340,7 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv case qvalue.QValueKindFloat64: return qvalue.QValueFloat64{Val: float64(v)}, nil default: - return nil, fmt.Errorf("cannot convert float to %s", qkind) + return nil, fmt.Errorf("cannot convert float64 to %s", qkind) } case []byte: switch qkind { @@ -351,7 +351,7 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv case 
qvalue.QValueKindJSON: return qvalue.QValueJSON{Val: string(v)}, nil default: - return nil, fmt.Errorf("cannot convert string to %s", qkind) + return nil, fmt.Errorf("cannot convert string %v to %s", v, qkind) } default: return nil, fmt.Errorf("unexpected mysql type %T", v) From b70d03e26cea3fe8b63408f039a98974fcc068fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 02:34:45 +0000 Subject: [PATCH 39/80] date/time formats --- flow/connectors/mysql/mysql.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index f64bb79278..a308786b74 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -350,6 +350,24 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv return qvalue.QValueBytes{Val: v}, nil case qvalue.QValueKindJSON: return qvalue.QValueJSON{Val: string(v)}, nil + case qvalue.QValueKindTimestamp: + val, err := time.Parse("2006-01-02 15:04:05.000000", string(v)) + if err != nil { + return nil, err + } + return qvalue.QValueTimestamp{Val: val}, nil + case qvalue.QValueKindTime: + val, err := time.Parse("15:04:05.000000", string(v)) + if err != nil { + return nil, err + } + return qvalue.QValueTime{Val: val}, nil + case qvalue.QValueKindDate: + val, err := time.Parse(time.DateOnly, string(v)) + if err != nil { + return nil, err + } + return qvalue.QValueDate{Val: val}, nil default: return nil, fmt.Errorf("cannot convert string %v to %s", v, qkind) } From 412d5eee56422e906f0d724a01c83fb10d8fd776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 03:53:47 +0000 Subject: [PATCH 40/80] sigh, mysql unquoted identifiers map to all caps, meanwhile postgres.. 
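
Illustrative sketch of the quoting approach taken below (assuming the connector's
SET sql_mode = ANSI from mysql.go is in effect, which makes MySQL treat double
quotes as identifier quotes the way Postgres does): KEY is a reserved word in
MySQL and Postgres folds unquoted identifiers to lowercase, so the e2e SQL quotes
both the column and the schema-qualified table name. Variable names mirror the
test change in the diff; this is a sketch, not additional behavior:

    // build a quoted schema.table name usable by both pg and mysql (ANSI_QUOTES)
    quotedSrcFullName := `"` + strings.ReplaceAll(srcFullName, ".", `"."`) + `"`
    // "key" must be quoted: reserved word in MySQL, case-folded in Postgres
    insertSQL := fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('init','1935-01-01')`, quotedSrcFullName)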
--- flow/e2e/clickhouse/peer_flow_ch_test.go | 7 ++++--- flow/e2e/mysql.go | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index eb7f4e9b36..942366563f 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -292,6 +292,7 @@ func (s ClickHouseSuite) Test_NullableColumnSetting() { func (s ClickHouseSuite) Test_Date32() { srcTableName := "test_date32" srcFullName := s.attachSchemaSuffix("test_date32") + quotedSrcFullName := "\"" + strings.ReplaceAll(srcFullName, ".", "\".\"") + "\"" dstTableName := "test_date32_dst" require.NoError(s.t, s.source.Exec(fmt.Sprintf(` @@ -300,10 +301,10 @@ func (s ClickHouseSuite) Test_Date32() { "key" TEXT NOT NULL, d DATE NOT NULL ); - `, srcFullName))) + `, quotedSrcFullName))) require.NoError(s.t, s.source.Exec( - fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('init','1935-01-01')`, srcFullName), + fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('init','1935-01-01')`, quotedSrcFullName), )) connectionGen := e2e.FlowConnectionGenerationConfig{ @@ -321,7 +322,7 @@ func (s ClickHouseSuite) Test_Date32() { e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key,d") require.NoError(s.t, s.source.Exec( - fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('cdc','1935-01-01')`, srcFullName), + fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('cdc','1935-01-01')`, quotedSrcFullName), )) e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key,d") diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 5446bd9719..09b28d1676 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -34,12 +34,16 @@ func SetupMySQL(t *testing.T, suffix string) (*MySqlSource, error) { return nil, fmt.Errorf("failed to create postgres connection: %w", err) } - if _, err := connector.Execute(context.Background(), "DROP DATABASE IF EXISTS e2e_test_"+suffix); err != nil { + if _, err := connector.Execute( + context.Background(), fmt.Sprintf("DROP DATABASE IF EXISTS \"e2e_test_%s\"", suffix), + ); err != nil { connector.Close() return nil, err } - if _, err := connector.Execute(context.Background(), "CREATE DATABASE e2e_test_"+suffix); err != nil { + if _, err := connector.Execute( + context.Background(), fmt.Sprintf("CREATE DATABASE \"e2e_test_%s\"", suffix), + ); err != nil { connector.Close() return nil, err } @@ -53,7 +57,9 @@ func (s *MySqlSource) Connector() connectors.Connector { func (s *MySqlSource) Teardown(t *testing.T, suffix string) { t.Helper() - if _, err := s.MySqlConnector.Execute(context.Background(), "DROP DATABASE IF EXISTS e2e_test_"+suffix); err != nil { + if _, err := s.MySqlConnector.Execute( + context.Background(), fmt.Sprintf("DROP DATABASE IF EXISTS \"e2e_test_%s\"", suffix), + ); err != nil { t.Log("failed to drop mysql database", err) s.MySqlConnector.Close() } From 490af9a7c4714cca72174f5e8b0f439fce063e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 05:10:01 +0000 Subject: [PATCH 41/80] try new gtid code, also keep chipping away at date32 test --- flow/connectors/mysql/cdc.go | 63 ++++++++++++++++++++++-------------- flow/e2e/test_utils.go | 2 +- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 7132b8a48f..f541ac933d 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -162,21 
+162,21 @@ func (c *MySqlConnector) startSyncer() *replication.BinlogSyncer { }) } -func (c *MySqlConnector) startStreaming(pos string) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { +func (c *MySqlConnector) startStreaming(pos string) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { if rest, isFile := strings.CutPrefix(pos, "!f:"); isFile { comma := strings.LastIndexByte(rest, ',') if comma == -1 { - return nil, nil, fmt.Errorf("no comma in file/pos offset %s", pos) + return nil, nil, nil, fmt.Errorf("no comma in file/pos offset %s", pos) } offset, err := strconv.ParseUint(rest[comma+1:], 16, 32) if err != nil { - return nil, nil, fmt.Errorf("invalid offset in filepos offset %s: %w", pos, err) + return nil, nil, nil, fmt.Errorf("invalid offset in filepos offset %s: %w", pos, err) } return c.startCdcStreamingFilePos(rest[:comma], uint32(offset)) } else { gset, err := mysql.ParseGTIDSet(c.config.Flavor, pos) if err != nil { - return nil, nil, err + return nil, nil, nil, err } return c.startCdcStreamingGtid(gset) } @@ -184,23 +184,25 @@ func (c *MySqlConnector) startStreaming(pos string) (*replication.BinlogSyncer, func (c *MySqlConnector) startCdcStreamingFilePos( lastOffsetName string, lastOffsetPos uint32, -) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { +) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { syncer := c.startSyncer() stream, err := syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) if err != nil { syncer.Close() } - return syncer, stream, err + return syncer, stream, nil, err } -func (c *MySqlConnector) startCdcStreamingGtid(gset mysql.GTIDSet) (*replication.BinlogSyncer, *replication.BinlogStreamer, error) { +func (c *MySqlConnector) startCdcStreamingGtid( + gset mysql.GTIDSet, +) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { // https://hevodata.com/learn/mysql-gtids-and-replication-set-up syncer := c.startSyncer() stream, err := syncer.StartSyncGTID(gset) if err != nil { syncer.Close() } - return syncer, stream, err + return syncer, stream, gset, err } func (c *MySqlConnector) ReplPing(context.Context) error { @@ -246,7 +248,7 @@ func (c *MySqlConnector) PullRecords( ) error { defer req.RecordStream.Close() - syncer, mystream, err := c.startStreaming(req.LastOffset.Text) + syncer, mystream, gset, err := c.startStreaming(req.LastOffset.Text) if err != nil { return err } @@ -281,28 +283,41 @@ func (c *MySqlConnector) PullRecords( ))) } + // TODO if gset == nil update pos with event.Header.LogPos + switch ev := event.Event.(type) { case *replication.RotateEvent: - req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%d", string(ev.NextLogName), ev.Position)) + if gset == nil { + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%d", string(ev.NextLogName), ev.Position)) + } case *replication.MariadbGTIDEvent: - var err error - newset, err := ev.GTIDNext() - if err != nil { - // TODO could ignore, but then we might get stuck rereading same batch each time - return err + if gset != nil { + var err error + newset, err := ev.GTIDNext() + if err != nil { + // TODO could ignore, but then we might get stuck rereading same batch each time + return err + } + if err := gset.Update(newset.String()); err != nil { + return err + } + req.RecordStream.UpdateLatestCheckpointText(gset.String()) } - req.RecordStream.UpdateLatestCheckpointText(newset.String()) case *replication.GTIDEvent: - var err error - 
newset, err := ev.GTIDNext() - if err != nil { - // TODO could ignore, but then we might get stuck rereading same batch each time - return err + if gset != nil { + var err error + newset, err := ev.GTIDNext() + if err != nil { + // TODO could ignore, but then we might get stuck rereading same batch each time + return err + } + if err := gset.Update(newset.String()); err != nil { + return err + } + req.RecordStream.UpdateLatestCheckpointText(gset.String()) } - req.RecordStream.UpdateLatestCheckpointText(newset.String()) case *replication.PreviousGTIDsEvent: - // TODO is this the correct way to handle this event? - req.RecordStream.UpdateLatestCheckpointText(ev.GTIDSets) + // TODO look into this, maybe we just do gset.Update(ev.GTIDSets) case *replication.RowsEvent: sourceTableName := string(ev.Table.Schema) + "." + string(ev.Table.Table) // TODO this is fragile destinationTableName := req.TableNameMapping[sourceTableName].Name diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index 46a1fd2da5..17bcd904b9 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -104,7 +104,7 @@ func GetPgRows(conn *connpostgres.PostgresConnector, suffix string, table string func GetMySqlRows(conn *connmysql.MySqlConnector, suffix string, table string, cols string) (*model.QRecordBatch, error) { rs, err := conn.Execute( context.Background(), - fmt.Sprintf(`SELECT %s FROM e2e_test_%s.%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), + fmt.Sprintf(`SELECT %s FROM "e2e_test_%s".%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), ) if err != nil { return nil, err From 01008b0e28e5aea17caf1acbe50ca9992570a398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 15:37:30 +0000 Subject: [PATCH 42/80] log mysql Execute --- flow/connectors/mysql/mysql.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index a308786b74..c2a7428ad5 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -58,6 +58,7 @@ func (c *MySqlConnector) connect(ctx context.Context, options ...client.Option) } func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interface{}) (*mysql.Result, error) { + slog.Info("mymymy", slog.String("query", cmd), slog.Any("when", time.Now())) reconnects := 3 for { // TODO need new connection if ctx changes between calls, or make upstream PR @@ -98,6 +99,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, resultCb client.SelectPerResultCallback, args ...interface{}, ) error { + slog.Info("mymymy stream", slog.String("query", cmd), slog.Any("when", time.Now())) reconnects := 3 for { // TODO need new connection if ctx changes between calls, or make upstream PR From b5b3568b6bc17cf92087553a2e41bd97471d6d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 18:33:47 +0000 Subject: [PATCH 43/80] share more connect logic --- flow/connectors/mysql/mysql.go | 50 +++++++++++++++------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index c2a7428ad5..db4de0dac1 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -52,9 +52,23 @@ func (c *MySqlConnector) ConnectionActive(context.Context) error { return nil } -func (c *MySqlConnector) connect(ctx context.Context, options ...client.Option) (*client.Conn, error) { - return 
client.ConnectWithContext(ctx, fmt.Sprintf("%s:%d", c.config.Host, c.config.Port), - c.config.User, c.config.Password, c.config.Database, time.Minute, options...) +func (c *MySqlConnector) connect(ctx context.Context) (*client.Conn, error) { + argF := []client.Option{func(conn *client.Conn) error { + conn.SetCapability(mysql.CLIENT_COMPRESS) + if !c.config.DisableTls { + conn.SetTLSConfig(&tls.Config{MinVersion: tls.VersionTLS13}) + } + return nil + }} + conn, err := client.ConnectWithContext(ctx, fmt.Sprintf("%s:%d", c.config.Host, c.config.Port), + c.config.User, c.config.Password, c.config.Database, time.Minute, argF...) + if err != nil { + return nil, err + } + if _, err := conn.Execute("SET sql_mode = ANSI"); err != nil { + return nil, fmt.Errorf("failed to set sql_mode to ANSI: %w", err) + } + return conn, nil } func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interface{}) (*mysql.Result, error) { @@ -64,27 +78,17 @@ func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interf // TODO need new connection if ctx changes between calls, or make upstream PR if c.conn == nil { var err error - var argF []client.Option - if !c.config.DisableTls { - argF = append(argF, func(conn *client.Conn) error { - conn.SetTLSConfig(&tls.Config{MinVersion: tls.VersionTLS13}) - return nil - }) - } - c.conn, err = c.connect(ctx, argF...) + c.conn, err = c.connect(ctx) if err != nil { return nil, fmt.Errorf("failed to connect to mysql server: %w", err) } - if _, err := c.conn.Execute("SET sql_mode = ANSI"); err != nil { - return nil, fmt.Errorf("failed to set sql_mode to ANSI: %w", err) - } } rs, err := c.conn.Execute(cmd, args...) if err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 - c.conn.Close() + _ = c.conn.Close() c.conn = nil continue } @@ -105,27 +109,17 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, // TODO need new connection if ctx changes between calls, or make upstream PR if c.conn == nil { var err error - var argF []client.Option - if !c.config.DisableTls { - argF = append(argF, func(conn *client.Conn) error { - conn.SetTLSConfig(&tls.Config{MinVersion: tls.VersionTLS13}) - return nil - }) - } - c.conn, err = c.connect(ctx, argF...) 
+ c.conn, err = c.connect(ctx) if err != nil { return fmt.Errorf("failed to connect to mysql server: %w", err) } - if _, err := c.conn.Execute("SET sql_mode = ANSI"); err != nil { - return fmt.Errorf("failed to set sql_mode to ANSI: %w", err) - } } - if len(args) == 0 { + if c.conn == nil && len(args) == 0 { // testing this branch being disabled if err := c.conn.ExecuteSelectStreaming(cmd, result, rowCb, resultCb); err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 - c.conn.Close() + _ = c.conn.Close() c.conn = nil continue } From 58df5ca13696ed567b5a399eab66cc5d0cdf6e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 19:35:13 +0000 Subject: [PATCH 44/80] test ` --- flow/connectors/mysql/mysql.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index db4de0dac1..d9b98616a6 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -7,6 +7,7 @@ import ( "crypto/tls" "fmt" "log/slog" + "strings" "time" "github.com/go-mysql-org/go-mysql/client" @@ -115,7 +116,9 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, } } - if c.conn == nil && len(args) == 0 { // testing this branch being disabled + cmd = strings.ReplaceAll(cmd, "\"", "`") // please don't work + + if len(args) == 0 { // testing this branch being disabled if err := c.conn.ExecuteSelectStreaming(cmd, result, rowCb, resultCb); err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 From 7c1d600ee55f627acdb707436703424c3d2d84e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 19:44:12 +0000 Subject: [PATCH 45/80] need to close statement to not hit limits --- flow/connectors/mysql/mysql.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index d9b98616a6..84a8b7b5b8 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -7,7 +7,6 @@ import ( "crypto/tls" "fmt" "log/slog" - "strings" "time" "github.com/go-mysql-org/go-mysql/client" @@ -116,9 +115,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, } } - cmd = strings.ReplaceAll(cmd, "\"", "`") // please don't work - - if len(args) == 0 { // testing this branch being disabled + if c.conn == nil && len(args) == 0 { // testing this branch being disabled if err := c.conn.ExecuteSelectStreaming(cmd, result, rowCb, resultCb); err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 @@ -139,7 +136,9 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, } return err } - if err := stmt.ExecuteSelectStreaming(result, rowCb, resultCb, args...); err != nil { + err = stmt.ExecuteSelectStreaming(result, rowCb, resultCb, args...) 
+ _ = stmt.Close() + if err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 c.conn.Close() From df2a8c8082b2844143bc0030c51f8784c5e366ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 20:16:01 +0000 Subject: [PATCH 46/80] show databases --- flow/connectors/mysql/mysql.go | 2 +- flow/connectors/mysql/qrep.go | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 84a8b7b5b8..0ece8487b0 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -115,7 +115,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, } } - if c.conn == nil && len(args) == 0 { // testing this branch being disabled + if len(args) == 0 { if err := c.conn.ExecuteSelectStreaming(cmd, result, rowCb, resultCb); err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 437625be94..94e17c42f3 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -178,6 +178,17 @@ func (c *MySqlConnector) PullQRepRecords( return nil } + // testing + rs, err := c.Execute(ctx, "show databases") + if err != nil { + return 0, fmt.Errorf("mymymy err %w", err) + } + for rowIdx, row := range rs.Values { + for idx, val := range row { + c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) + } + } + if last.FullTablePartition { // this is a full table partition, so just run the query var rs mysql.Result From d6e621a7e625c92ca33db923a24de8ff0e341777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 20:44:51 +0000 Subject: [PATCH 47/80] force query --- flow/connectors/mysql/qrep.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 94e17c42f3..fb4bceb14e 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -150,6 +150,7 @@ func (c *MySqlConnector) PullQRepRecords( if err != nil { return 0, err } + query = fmt.Sprintf("select * from %s", config.WatermarkTable) totalRecords := 0 onResult := func(rs *mysql.Result) error { @@ -178,16 +179,18 @@ func (c *MySqlConnector) PullQRepRecords( return nil } - // testing - rs, err := c.Execute(ctx, "show databases") - if err != nil { - return 0, fmt.Errorf("mymymy err %w", err) - } - for rowIdx, row := range rs.Values { - for idx, val := range row { - c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) + /* + // testing + rs, err := c.Execute(ctx, "show databases") + if err != nil { + return 0, fmt.Errorf("mymymy err %w", err) } - } + for rowIdx, row := range rs.Values { + for idx, val := range row { + c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) + } + } + */ if last.FullTablePartition { // this is a full table partition, so just run the query From 672c5e18212eb585f01ccc71870695057b2f591a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 21:00:10 +0000 Subject: [PATCH 48/80] show tables --- flow/connectors/mysql/qrep.go | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/flow/connectors/mysql/qrep.go 
b/flow/connectors/mysql/qrep.go index fb4bceb14e..83c35b1761 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "log/slog" + "strings" "text/template" "github.com/go-mysql-org/go-mysql/mysql" @@ -150,7 +151,6 @@ func (c *MySqlConnector) PullQRepRecords( if err != nil { return 0, err } - query = fmt.Sprintf("select * from %s", config.WatermarkTable) totalRecords := 0 onResult := func(rs *mysql.Result) error { @@ -179,18 +179,17 @@ func (c *MySqlConnector) PullQRepRecords( return nil } - /* - // testing - rs, err := c.Execute(ctx, "show databases") - if err != nil { - return 0, fmt.Errorf("mymymy err %w", err) - } - for rowIdx, row := range rs.Values { - for idx, val := range row { - c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) - } + // testing + schema, _, _ := strings.Cut(config.WatermarkTable, ".") + rs, err := c.Execute(ctx, fmt.Sprintf("show tables from %s", schema)) + if err != nil { + return 0, fmt.Errorf("mymymy err %w", err) + } + for rowIdx, row := range rs.Values { + for idx, val := range row { + c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) } - */ + } if last.FullTablePartition { // this is a full table partition, so just run the query From c5675d066a190e29dd685979717d4ccf60fec925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 21:43:57 +0000 Subject: [PATCH 49/80] log exits --- flow/connectors/mysql/qrep.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 83c35b1761..0e1c13f021 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -181,7 +181,7 @@ func (c *MySqlConnector) PullQRepRecords( // testing schema, _, _ := strings.Cut(config.WatermarkTable, ".") - rs, err := c.Execute(ctx, fmt.Sprintf("show tables from %s", schema)) + rs, err := c.Execute(ctx, "show tables from "+schema) if err != nil { return 0, fmt.Errorf("mymymy err %w", err) } @@ -195,6 +195,7 @@ func (c *MySqlConnector) PullQRepRecords( // this is a full table partition, so just run the query var rs mysql.Result if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult); err != nil { + c.logger.Error("mymymy full err", slog.Any("error", err)) return 0, err } } else { @@ -215,10 +216,13 @@ func (c *MySqlConnector) PullQRepRecords( var rs mysql.Result if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult, rangeStart, rangeEnd); err != nil { + c.logger.Error("mymymy partial err", slog.Any("error", err)) return 0, err } } + c.logger.Info("mymymy success") + close(stream.Records) return totalRecords, nil } From db8d7e2e31b270d1465ef510fe6ff4e158c42ff1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 21:58:00 +0000 Subject: [PATCH 50/80] did I misread ordering of these callbacks --- flow/connectors/mysql/qrep.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 0e1c13f021..d52191bd09 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -154,19 +154,23 @@ func (c *MySqlConnector) PullQRepRecords( totalRecords := 0 onResult := func(rs *mysql.Result) error { + c.logger.Info("result", slog.Any("rs", rs)) schema, err := QRecordSchemaFromMysqlFields(rs.Fields) if err != nil { return err } + 
c.logger.Info("set schema") stream.SetSchema(schema) return nil } onRow := func(row []mysql.FieldValue) error { totalRecords += 1 // TODO can this be batched in onResult or by checking rs at end? + c.logger.Info("getting schema") schema, err := stream.Schema() if err != nil { return err } + c.logger.Info("got schema") record := make([]qvalue.QValue, 0, len(row)) for idx, val := range row { qv, err := QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) From f5cfddd985c11f9bbe2e7e02567dfe611d07cbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 10 Jan 2025 22:17:06 +0000 Subject: [PATCH 51/80] less logging, crashed browser tab, but somehow we're seeing results in right order just streaming doesn't end until we database drop --- flow/connectors/mysql/mysql.go | 2 +- flow/connectors/mysql/qrep.go | 23 ++++++----------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 0ece8487b0..123b8bc7a0 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -141,7 +141,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, if err != nil { if reconnects > 0 && mysql.ErrorEqual(err, mysql.ErrBadConn) { reconnects -= 1 - c.conn.Close() + _ = c.conn.Close() c.conn = nil continue } diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index d52191bd09..4517f07f5b 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "log/slog" - "strings" "text/template" "github.com/go-mysql-org/go-mysql/mysql" @@ -154,23 +153,23 @@ func (c *MySqlConnector) PullQRepRecords( totalRecords := 0 onResult := func(rs *mysql.Result) error { - c.logger.Info("result", slog.Any("rs", rs)) schema, err := QRecordSchemaFromMysqlFields(rs.Fields) if err != nil { return err } - c.logger.Info("set schema") + c.logger.Info("mymy set schema") stream.SetSchema(schema) return nil } onRow := func(row []mysql.FieldValue) error { totalRecords += 1 // TODO can this be batched in onResult or by checking rs at end? 
- c.logger.Info("getting schema") + c.logger.Info("mymy getting schema") schema, err := stream.Schema() if err != nil { + c.logger.Error("mymy error schema", slog.Any("error", err)) return err } - c.logger.Info("got schema") + c.logger.Info("mymy got schema") record := make([]qvalue.QValue, 0, len(row)) for idx, val := range row { qv, err := QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) @@ -179,22 +178,12 @@ func (c *MySqlConnector) PullQRepRecords( } record = append(record, qv) } + c.logger.Info("mymy append record") stream.Records <- record + c.logger.Info("mymy appended record") return nil } - // testing - schema, _, _ := strings.Cut(config.WatermarkTable, ".") - rs, err := c.Execute(ctx, "show tables from "+schema) - if err != nil { - return 0, fmt.Errorf("mymymy err %w", err) - } - for rowIdx, row := range rs.Values { - for idx, val := range row { - c.logger.Info("mymymy show", slog.Int("rowIdx", rowIdx), slog.Int("idx", idx), slog.Any("field", string(val.AsString()))) - } - } - if last.FullTablePartition { // this is a full table partition, so just run the query var rs mysql.Result From ff1bb1d847d80ca7b6de56a4c99e5dde8f77b24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sat, 11 Jan 2025 00:55:37 +0000 Subject: [PATCH 52/80] I'm an idiot --- flow/connectors/mysql/mysql.go | 3 +-- flow/connectors/mysql/qrep.go | 10 ---------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 123b8bc7a0..5b8e96e2ef 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -72,7 +72,6 @@ func (c *MySqlConnector) connect(ctx context.Context) (*client.Conn, error) { } func (c *MySqlConnector) Execute(ctx context.Context, cmd string, args ...interface{}) (*mysql.Result, error) { - slog.Info("mymymy", slog.String("query", cmd), slog.Any("when", time.Now())) reconnects := 3 for { // TODO need new connection if ctx changes between calls, or make upstream PR @@ -103,7 +102,6 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, resultCb client.SelectPerResultCallback, args ...interface{}, ) error { - slog.Info("mymymy stream", slog.String("query", cmd), slog.Any("when", time.Now())) reconnects := 3 for { // TODO need new connection if ctx changes between calls, or make upstream PR @@ -148,6 +146,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, return err } } + return nil } } diff --git a/flow/connectors/mysql/qrep.go b/flow/connectors/mysql/qrep.go index 4517f07f5b..437625be94 100644 --- a/flow/connectors/mysql/qrep.go +++ b/flow/connectors/mysql/qrep.go @@ -157,19 +157,15 @@ func (c *MySqlConnector) PullQRepRecords( if err != nil { return err } - c.logger.Info("mymy set schema") stream.SetSchema(schema) return nil } onRow := func(row []mysql.FieldValue) error { totalRecords += 1 // TODO can this be batched in onResult or by checking rs at end? 
- c.logger.Info("mymy getting schema") schema, err := stream.Schema() if err != nil { - c.logger.Error("mymy error schema", slog.Any("error", err)) return err } - c.logger.Info("mymy got schema") record := make([]qvalue.QValue, 0, len(row)) for idx, val := range row { qv, err := QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) @@ -178,9 +174,7 @@ func (c *MySqlConnector) PullQRepRecords( } record = append(record, qv) } - c.logger.Info("mymy append record") stream.Records <- record - c.logger.Info("mymy appended record") return nil } @@ -188,7 +182,6 @@ func (c *MySqlConnector) PullQRepRecords( // this is a full table partition, so just run the query var rs mysql.Result if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult); err != nil { - c.logger.Error("mymymy full err", slog.Any("error", err)) return 0, err } } else { @@ -209,13 +202,10 @@ func (c *MySqlConnector) PullQRepRecords( var rs mysql.Result if err := c.ExecuteSelectStreaming(ctx, query, &rs, onRow, onResult, rangeStart, rangeEnd); err != nil { - c.logger.Error("mymymy partial err", slog.Any("error", err)) return 0, err } } - c.logger.Info("mymymy success") - close(stream.Records) return totalRecords, nil } From 648ba2adf9df25eb2640cfa2e07befc740919266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sat, 11 Jan 2025 14:41:42 +0000 Subject: [PATCH 53/80] convert couple more tests to pg/mysql, remove Genericness --- flow/e2e/bigquery/bigquery.go | 4 + flow/e2e/clickhouse/clickhouse.go | 4 + flow/e2e/clickhouse/peer_flow_ch_test.go | 163 ++++++++---------- flow/e2e/congen.go | 2 +- flow/e2e/elasticsearch/elasticsearch.go | 4 + flow/e2e/eventhub/peer_flow_eh_test.go | 4 + flow/e2e/generic/generic_test.go | 30 ++-- flow/e2e/kafka/kafka_test.go | 4 + flow/e2e/pg.go | 2 +- flow/e2e/postgres/postgres.go | 4 + flow/e2e/pubsub/pubsub_test.go | 4 + flow/e2e/s3/qrep_flow_s3_test.go | 4 + flow/e2e/snowflake/snowflake.go | 4 + .../e2e/sqlserver/qrep_flow_sqlserver_test.go | 4 + flow/e2e/test_utils.go | 45 ++--- 15 files changed, 154 insertions(+), 128 deletions(-) diff --git a/flow/e2e/bigquery/bigquery.go b/flow/e2e/bigquery/bigquery.go index 98d8e79b94..fe70949927 100644 --- a/flow/e2e/bigquery/bigquery.go +++ b/flow/e2e/bigquery/bigquery.go @@ -36,6 +36,10 @@ func (s PeerFlowE2ETestSuiteBQ) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s PeerFlowE2ETestSuiteBQ) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PeerFlowE2ETestSuiteBQ) DestinationConnector() connectors.Connector { // TODO have BQ connector return nil diff --git a/flow/e2e/clickhouse/clickhouse.go b/flow/e2e/clickhouse/clickhouse.go index b89fa6e052..e8a43ed5e4 100644 --- a/flow/e2e/clickhouse/clickhouse.go +++ b/flow/e2e/clickhouse/clickhouse.go @@ -42,6 +42,10 @@ func (s ClickHouseSuite) Connector() *connpostgres.PostgresConnector { return c } +func (s ClickHouseSuite) Source() e2e.SuiteSource { + return s.source +} + func (s ClickHouseSuite) DestinationConnector() connectors.Connector { // TODO have CH connector return nil diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index 942366563f..f95e76c454 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/require" connclickhouse "github.com/PeerDB-io/peerdb/flow/connectors/clickhouse" + connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" 
"github.com/PeerDB-io/peerdb/flow/e2e" "github.com/PeerDB-io/peerdb/flow/e2eshared" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -50,21 +51,19 @@ func (s ClickHouseSuite) Test_Addition_Removal() { dstTableName := "test_table_add_remove_target" addedDstTableName := "test_table_add_remove_target_added" - _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` + require.NoError(s.t, s.source.Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id SERIAL PRIMARY KEY, - key TEXT NOT NULL + "key" TEXT NOT NULL ); - `, srcTableName)) - require.NoError(s.t, err) + `, srcTableName))) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - CREATE TABLE IF NOT EXISTS %s ( - id SERIAL PRIMARY KEY, - key TEXT NOT NULL - ); - `, addedSrcTableName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(` + CREATE TABLE IF NOT EXISTS %s ( + id SERIAL PRIMARY KEY, + "key" TEXT NOT NULL + ); + `, addedSrcTableName))) connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: s.attachSuffix("clickhousetableremoval"), @@ -81,38 +80,36 @@ func (s ClickHouseSuite) Test_Addition_Removal() { var flowStatus protos.FlowStatus val, err := env.Query(shared.FlowStatusQuery) e2e.EnvNoError(s.t, env, err) - err = val.Get(&flowStatus) - e2e.EnvNoError(s.t, env, err) + e2e.EnvNoError(s.t, env, val.Get(&flowStatus)) return flowStatus } e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key) VALUES ('test'); - `, srcTableName)) - require.NoError(s.t, err) - e2e.EnvWaitForEqualTablesWithNames(env, s, "first insert", "test_table_add_remove", dstTableName, "id,key") + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s ("key") VALUES ('test')`, srcTableName))) + e2e.EnvWaitForEqualTablesWithNames(env, s, "first insert", "test_table_add_remove", dstTableName, "id,\"key\"") e2e.SignalWorkflow(env, model.FlowSignal, model.PauseSignal) e2e.EnvWaitFor(s.t, env, 4*time.Minute, "pausing for add table", func() bool { flowStatus := getFlowStatus() return flowStatus == protos.FlowStatus_STATUS_PAUSED }) - _, err = s.Conn().Exec(context.Background(), - `SELECT pg_terminate_backend(pid) FROM pg_stat_activity - WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) - require.NoError(s.t, err) - - e2e.EnvWaitFor(s.t, env, 3*time.Minute, "waiting for replication to stop", func() bool { - rows, err := s.Conn().Query(context.Background(), ` - SELECT pid FROM pg_stat_activity - WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender' - `) + if pgconn, ok := s.source.Connector().(*connpostgres.PostgresConnector); ok { + conn := pgconn.Conn() + _, err := conn.Exec(context.Background(), + `SELECT pg_terminate_backend(pid) FROM pg_stat_activity + WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) require.NoError(s.t, err) - defer rows.Close() - return !rows.Next() - }) + + e2e.EnvWaitFor(s.t, env, 3*time.Minute, "waiting for replication to stop", func() bool { + rows, err := conn.Query(context.Background(), + `SELECT pid FROM pg_stat_activity + WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) + require.NoError(s.t, err) + defer rows.Close() + return !rows.Next() + }) + } runID := e2e.EnvGetRunID(s.t, env) e2e.SignalWorkflow(env, model.CDCDynamicPropertiesSignal, 
&protos.CDCFlowConfigUpdate{ @@ -131,31 +128,30 @@ func (s ClickHouseSuite) Test_Addition_Removal() { afterAddRunID := e2e.EnvGetRunID(s.t, env) require.NotEqual(s.t, runID, afterAddRunID) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (key) VALUES ('test'); - `, addedSrcTableName)) - require.NoError(s.t, err) - e2e.EnvWaitForEqualTablesWithNames(env, s, "first insert to added table", "test_table_add_remove_added", addedDstTableName, "id,key") + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s ("key") VALUES ('test')`, addedSrcTableName))) + e2e.EnvWaitForEqualTablesWithNames(env, s, "first insert to added table", "test_table_add_remove_added", addedDstTableName, "id,\"key\"") e2e.SignalWorkflow(env, model.FlowSignal, model.PauseSignal) e2e.EnvWaitFor(s.t, env, 3*time.Minute, "pausing again for removing table", func() bool { flowStatus := getFlowStatus() return flowStatus == protos.FlowStatus_STATUS_PAUSED }) - _, err = s.Conn().Exec(context.Background(), - `SELECT pg_terminate_backend(pid) FROM pg_stat_activity - WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) - require.NoError(s.t, err) - - e2e.EnvWaitFor(s.t, env, 3*time.Minute, "waiting for replication to stop", func() bool { - rows, err := s.Conn().Query(context.Background(), ` - SELECT pid FROM pg_stat_activity - WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender' - `) + if pgconn, ok := s.source.Connector().(*connpostgres.PostgresConnector); ok { + conn := pgconn.Conn() + _, err := conn.Exec(context.Background(), + `SELECT pg_terminate_backend(pid) FROM pg_stat_activity + WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) require.NoError(s.t, err) - defer rows.Close() - return !rows.Next() - }) + + e2e.EnvWaitFor(s.t, env, 3*time.Minute, "waiting for replication to stop", func() bool { + rows, err := conn.Query(context.Background(), + `SELECT pid FROM pg_stat_activity + WHERE query LIKE '%START_REPLICATION%' AND query LIKE '%clickhousetableremoval%' AND backend_type='walsender'`) + require.NoError(s.t, err) + defer rows.Close() + return !rows.Next() + }) + } e2e.SignalWorkflow(env, model.CDCDynamicPropertiesSignal, &protos.CDCFlowConfigUpdate{ RemovedTables: []*protos.TableMapping{ @@ -173,13 +169,10 @@ func (s ClickHouseSuite) Test_Addition_Removal() { afterRemoveRunID := e2e.EnvGetRunID(s.t, env) require.NotEqual(s.t, runID, afterRemoveRunID) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", srcTableName)) - require.NoError(s.t, err) - - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", addedSrcTableName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", srcTableName))) + require.NoError(s.t, s.source.Exec(fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", addedSrcTableName))) - e2e.EnvWaitForEqualTablesWithNames(env, s, "second insert to added table", "test_table_add_remove_added", addedDstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "second insert to added table", "test_table_add_remove_added", addedDstTableName, "id,\"key\"") rows, err := s.GetRows(dstTableName, "id") require.NoError(s.t, err) @@ -222,14 +215,14 @@ func (s ClickHouseSuite) Test_NullableMirrorSetting() { env := e2e.ExecutePeerflow(tc, 
peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key,val,n,t") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\",val,n,t") _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` INSERT INTO %s (key) VALUES ('cdc'); `, srcFullName)) require.NoError(s.t, err) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key,val,n,t") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\",val,n,t") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -276,14 +269,14 @@ func (s ClickHouseSuite) Test_NullableColumnSetting() { env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key,val,n,t") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\",val,n,t") _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` INSERT INTO %s (key) VALUES ('cdc'); `, srcFullName)) require.NoError(s.t, err) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key,val,n,t") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\",val,n,t") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -319,13 +312,13 @@ func (s ClickHouseSuite) Test_Date32() { env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key,d") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\",d") require.NoError(s.t, s.source.Exec( fmt.Sprintf(`INSERT INTO %s ("key",d) VALUES ('cdc','1935-01-01')`, quotedSrcFullName), )) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key,d") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\",d") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -336,18 +329,14 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Disabled() { srcFullName := s.attachSchemaSuffix("test_update_pkey_disabled") dstTableName := "test_update_pkey_disabled_dst" - _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` + require.NoError(s.t, s.source.Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id INT PRIMARY KEY, - key TEXT NOT NULL + "key" TEXT NOT NULL ); - `, srcFullName)) - require.NoError(s.t, err) + `, srcFullName))) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (id,key) VALUES (1,'init'); - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s (id,"key") VALUES (1,'init')`, srcFullName))) connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: s.attachSuffix("clickhouse_pkey_update_disabled"), @@ -362,12 +351,9 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Disabled() { env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key") + 
e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - UPDATE %s SET id = 2, key = 'update' WHERE id = 1; - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id = 2, "key" = 'update' WHERE id = 1`, srcFullName))) e2e.EnvWaitFor(s.t, env, time.Minute, "waiting for duplicate row", func() bool { rows, err := s.GetRows(dstTableName, "id") @@ -384,18 +370,14 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Enabled() { srcFullName := s.attachSchemaSuffix("test_update_pkey_enabled") dstTableName := "test_update_pkey_enabled_dst" - _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` + require.NoError(s.t, s.source.Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id INT PRIMARY KEY, - key TEXT NOT NULL + "key" TEXT NOT NULL ); - `, srcFullName)) - require.NoError(s.t, err) + `, srcFullName))) - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s (id,key) VALUES (1,'init'); - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s (id,"key") VALUES (1,'init')`, srcFullName))) connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: s.attachSuffix("clickhouse_pkey_update_enabled"), @@ -410,14 +392,11 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Enabled() { env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") - _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` - UPDATE %s SET id = 2, key = 'update' WHERE id = 1; - `, srcFullName)) - require.NoError(s.t, err) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id = 2, key = 'update' WHERE id = 1`, srcFullName))) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\"") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -493,12 +472,12 @@ func (s ClickHouseSuite) WeirdTable(tableName string) { env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") _, err = s.Conn().Exec(context.Background(), fmt.Sprintf("INSERT INTO %s (key) VALUES ('cdc')", srcFullName)) require.NoError(s.t, err) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\"") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -517,7 +496,7 @@ func (s ClickHouseSuite) WeirdTable(tableName string) { flowConnConfig.Resync = true env = e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, 
dstTableName, "id,\"key\"") env.Cancel() e2e.RequireEnvCanceled(s.t, env) @@ -534,7 +513,7 @@ func (s ClickHouseSuite) WeirdTable(tableName string) { require.NoError(s.t, ch.Close()) env = e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) e2e.SetupCDCFlowStatusQuery(s.t, env, flowConnConfig) - e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,key") + e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") env.Cancel() e2e.RequireEnvCanceled(s.t, env) } diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index 7449281bb3..6898414c7a 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -19,7 +19,7 @@ type SuiteSource interface { Exec(sql string) error } -func TableMappings[TSource connectors.Connector](s GenericSuite[TSource], tables ...string) []*protos.TableMapping { +func TableMappings(s GenericSuite, tables ...string) []*protos.TableMapping { if len(tables)&1 != 0 { panic("must receive even number of table names") } diff --git a/flow/e2e/elasticsearch/elasticsearch.go b/flow/e2e/elasticsearch/elasticsearch.go index c8b1645bbe..3a9a400061 100644 --- a/flow/e2e/elasticsearch/elasticsearch.go +++ b/flow/e2e/elasticsearch/elasticsearch.go @@ -32,6 +32,10 @@ func (s elasticsearchSuite) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s elasticsearchSuite) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s elasticsearchSuite) Suffix() string { return s.suffix } diff --git a/flow/e2e/eventhub/peer_flow_eh_test.go b/flow/e2e/eventhub/peer_flow_eh_test.go index 33a46ecb7f..67a2c09a6c 100644 --- a/flow/e2e/eventhub/peer_flow_eh_test.go +++ b/flow/e2e/eventhub/peer_flow_eh_test.go @@ -36,6 +36,10 @@ func (s EventhubsSuite) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s EventhubsSuite) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s EventhubsSuite) Conn() *pgx.Conn { return s.Connector().Conn() } diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index b6c794c581..9159787d2f 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -32,15 +32,19 @@ func TestGenericBQ(t *testing.T) { e2eshared.RunSuite(t, SetupGenericSuite(e2e_bigquery.SetupSuite)) } -func TestGenericCH(t *testing.T) { +func TestGenericCH_PG(t *testing.T) { e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite(t, e2e.SetupPostgres))) } +func TestGenericCH_MySQL(t *testing.T) { + e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite(t, e2e.SetupMySQL))) +} + type Generic struct { - e2e.GenericSuite[*connpostgres.PostgresConnector] + e2e.GenericSuite } -func SetupGenericSuite[T e2e.GenericSuite[*connpostgres.PostgresConnector]](f func(t *testing.T) T) func(t *testing.T) Generic { +func SetupGenericSuite[T e2e.GenericSuite](f func(t *testing.T) T) func(t *testing.T) Generic { return func(t *testing.T) Generic { t.Helper() return Generic{f(t)} @@ -52,16 +56,19 @@ func (s Generic) Test_Simple_Flow() { srcTable := "test_simple" dstTable := "test_simple_dst" srcSchemaTable := e2e.AttachSchema(s, srcTable) + hstoreType := "TEXT" + if _, isPg := s.Source().Connector().(*connpostgres.PostgresConnector); isPg { + hstoreType = "HSTORE" + } - _, err := s.Connector().Conn().Exec(context.Background(), fmt.Sprintf(` + require.NoError(t, s.Source().Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id SERIAL 
PRIMARY KEY, - key TEXT NOT NULL, + "key" TEXT NOT NULL, value TEXT NOT NULL, - myh HSTORE NOT NULL + myh %s NOT NULL ); - `, srcSchemaTable)) - require.NoError(t, err) + `, srcSchemaTable, hstoreType))) connectionGen := e2e.FlowConnectionGenerationConfig{ FlowJobName: e2e.AddSuffix(s, "test_simple"), @@ -78,10 +85,9 @@ func (s Generic) Test_Simple_Flow() { for i := range 10 { testKey := fmt.Sprintf("test_key_%d", i) testValue := fmt.Sprintf("test_value_%d", i) - _, err = s.Connector().Conn().Exec(context.Background(), fmt.Sprintf(` - INSERT INTO %s(key, value, myh) VALUES ($1, $2, '"a"=>"b"') - `, srcSchemaTable), testKey, testValue) - e2e.EnvNoError(t, env, err) + e2e.EnvNoError(t, env, s.Source().Exec( + fmt.Sprintf(`INSERT INTO %s("key", value, myh) VALUES ('%s', '%s', '"a"=>"b"')`, srcSchemaTable, testKey, testValue), + )) } t.Log("Inserted 10 rows into the source table") diff --git a/flow/e2e/kafka/kafka_test.go b/flow/e2e/kafka/kafka_test.go index 37b7f0d177..041d62fdad 100644 --- a/flow/e2e/kafka/kafka_test.go +++ b/flow/e2e/kafka/kafka_test.go @@ -33,6 +33,10 @@ func (s KafkaSuite) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s KafkaSuite) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s KafkaSuite) Conn() *pgx.Conn { return s.Connector().Conn() } diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 8f1146d51b..5237d4ea72 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -147,7 +147,7 @@ func (s *PostgresSource) Teardown(t *testing.T, suffix string) { } } -func TearDownPostgres(s Suite[*connpostgres.PostgresConnector]) { +func TearDownPostgres(s Suite) { t := s.T() t.Helper() diff --git a/flow/e2e/postgres/postgres.go b/flow/e2e/postgres/postgres.go index 7bc32a3be8..82e3025f59 100644 --- a/flow/e2e/postgres/postgres.go +++ b/flow/e2e/postgres/postgres.go @@ -32,6 +32,10 @@ func (s PeerFlowE2ETestSuitePG) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s PeerFlowE2ETestSuitePG) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PeerFlowE2ETestSuitePG) DestinationConnector() connectors.Connector { return s.conn } diff --git a/flow/e2e/pubsub/pubsub_test.go b/flow/e2e/pubsub/pubsub_test.go index 742f403e3c..258b0ec296 100644 --- a/flow/e2e/pubsub/pubsub_test.go +++ b/flow/e2e/pubsub/pubsub_test.go @@ -38,6 +38,10 @@ func (s PubSubSuite) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s PubSubSuite) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PubSubSuite) Conn() *pgx.Conn { return s.Connector().Conn() } diff --git a/flow/e2e/s3/qrep_flow_s3_test.go b/flow/e2e/s3/qrep_flow_s3_test.go index e8f49c3d2e..4bdc8d584f 100644 --- a/flow/e2e/s3/qrep_flow_s3_test.go +++ b/flow/e2e/s3/qrep_flow_s3_test.go @@ -32,6 +32,10 @@ func (s PeerFlowE2ETestSuiteS3) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s PeerFlowE2ETestSuiteS3) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PeerFlowE2ETestSuiteS3) Suffix() string { return s.suffix } diff --git a/flow/e2e/snowflake/snowflake.go b/flow/e2e/snowflake/snowflake.go index 8a741852b7..5c107d5852 100644 --- a/flow/e2e/snowflake/snowflake.go +++ b/flow/e2e/snowflake/snowflake.go @@ -36,6 +36,10 @@ func (s PeerFlowE2ETestSuiteSF) Connector() *connpostgres.PostgresConnector { return s.conn } +func (s PeerFlowE2ETestSuiteSF) Source() e2e.SuiteSource { + return 
&e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PeerFlowE2ETestSuiteSF) DestinationConnector() connectors.Connector { return s.connector } diff --git a/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go b/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go index 230a092dc2..52bc2fa18e 100644 --- a/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go +++ b/flow/e2e/sqlserver/qrep_flow_sqlserver_test.go @@ -41,6 +41,10 @@ func (s PeerFlowE2ETestSuiteSQLServer) Connector() *connpostgres.PostgresConnect return s.conn } +func (s PeerFlowE2ETestSuiteSQLServer) Source() e2e.SuiteSource { + return &e2e.PostgresSource{PostgresConnector: s.conn} +} + func (s PeerFlowE2ETestSuiteSQLServer) Suffix() string { return s.suffix } diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index 17bcd904b9..545b80a80c 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -40,30 +40,31 @@ func init() { _ = godotenv.Load() } -type Suite[TSource connectors.Connector] interface { +type Suite interface { e2eshared.Suite T() *testing.T - Connector() TSource + Connector() *connpostgres.PostgresConnector Suffix() string + Source() SuiteSource } -type RowSource[TSource connectors.Connector] interface { - Suite[TSource] +type RowSource interface { + Suite GetRows(table, cols string) (*model.QRecordBatch, error) } -type GenericSuite[TSource connectors.Connector] interface { - RowSource[TSource] +type GenericSuite interface { + RowSource Peer() *protos.Peer DestinationConnector() connectors.Connector DestinationTable(table string) string } -func AttachSchema(s interface{ Suffix() string }, table string) string { +func AttachSchema(s Suite, table string) string { return fmt.Sprintf("e2e_test_%s.%s", s.Suffix(), table) } -func AddSuffix[T connectors.Connector](s Suite[T], str string) string { +func AddSuffix(s Suite, str string) string { return fmt.Sprintf("%s_%s", str, s.Suffix()) } @@ -135,9 +136,9 @@ func GetMySqlRows(conn *connmysql.MySqlConnector, suffix string, table string, c return batch, nil } -func GetSuiteSourceRows[TSource connectors.Connector](suite Suite[TSource], table string, cols string) (*model.QRecordBatch, error) { +func GetSuiteSourceRows(suite Suite, table string, cols string) (*model.QRecordBatch, error) { // TODO move to SuiteSource - switch conn := any(suite.Connector()).(type) { + switch conn := any(suite.Source().Connector()).(type) { case *connpostgres.PostgresConnector: return GetPgRows(conn, suite.Suffix(), table, cols) case *connmysql.MySqlConnector: @@ -147,7 +148,7 @@ func GetSuiteSourceRows[TSource connectors.Connector](suite Suite[TSource], tabl } } -func RequireEqualTables[TSource connectors.Connector](suite RowSource[TSource], table string, cols string) { +func RequireEqualTables(suite RowSource, table string, cols string) { t := suite.T() t.Helper() @@ -160,12 +161,12 @@ func RequireEqualTables[TSource connectors.Connector](suite RowSource[TSource], require.True(t, e2eshared.CheckEqualRecordBatches(t, sourceRows, rows)) } -func EnvEqualTables[TSource connectors.Connector](env WorkflowRun, suite RowSource[TSource], table string, cols string) { +func EnvEqualTables[TSource connectors.Connector](env WorkflowRun, suite RowSource, table string, cols string) { EnvEqualTablesWithNames(env, suite, table, table, cols) } -func EnvEqualTablesWithNames[TSource connectors.Connector]( - env WorkflowRun, suite RowSource[TSource], srcTable string, dstTable string, cols string, +func EnvEqualTablesWithNames( + env WorkflowRun, suite RowSource, srcTable string, dstTable string, cols 
string, ) { t := suite.T() t.Helper() @@ -179,9 +180,9 @@ func EnvEqualTablesWithNames[TSource connectors.Connector]( EnvEqualRecordBatches(t, env, sourceRows, rows) } -func EnvWaitForEqualTables[TSource connectors.Connector]( +func EnvWaitForEqualTables( env WorkflowRun, - suite RowSource[TSource], + suite RowSource, reason string, table string, cols string, @@ -190,9 +191,9 @@ func EnvWaitForEqualTables[TSource connectors.Connector]( EnvWaitForEqualTablesWithNames(env, suite, reason, table, table, cols) } -func EnvWaitForEqualTablesWithNames[TSource connectors.Connector]( +func EnvWaitForEqualTablesWithNames( env WorkflowRun, - suite RowSource[TSource], + suite RowSource, reason string, srcTable string, dstTable string, @@ -220,9 +221,9 @@ func EnvWaitForEqualTablesWithNames[TSource connectors.Connector]( }) } -func EnvWaitForCount[TSource connectors.Connector]( +func EnvWaitForCount( env WorkflowRun, - suite RowSource[TSource], + suite RowSource, reason string, dstTable string, cols string, @@ -591,7 +592,7 @@ func GetOwnersSelectorStringsSF() [2]string { return [2]string{strings.Join(pgFields, ","), strings.Join(sfFields, ",")} } -func ExpectedDestinationIdentifier[T connectors.Connector](s GenericSuite[T], ident string) string { +func ExpectedDestinationIdentifier(s GenericSuite, ident string) string { switch s.DestinationConnector().(type) { case *connsnowflake.SnowflakeConnector: return strings.ToUpper(ident) @@ -600,7 +601,7 @@ func ExpectedDestinationIdentifier[T connectors.Connector](s GenericSuite[T], id } } -func ExpectedDestinationTableName[T connectors.Connector](s GenericSuite[T], table string) string { +func ExpectedDestinationTableName(s GenericSuite, table string) string { return ExpectedDestinationIdentifier(s, s.DestinationTable(table)) } From 31ab468cfd31ddbb9f9a19e171ecbc540a8f8c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 13 Jan 2025 20:24:26 +0000 Subject: [PATCH 54/80] map blob to string since clickhouse string is binary target anyways --- flow/connectors/mysql/mysql.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 5b8e96e2ef..910bae42c7 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -265,13 +265,13 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { case mysql.MYSQL_TYPE_SET: return qvalue.QValueKindInt64, nil case mysql.MYSQL_TYPE_TINY_BLOB: - return qvalue.QValueKindBytes, nil + return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_MEDIUM_BLOB: - return qvalue.QValueKindBytes, nil + return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_LONG_BLOB: - return qvalue.QValueKindBytes, nil + return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_BLOB: - return qvalue.QValueKindBytes, nil + return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_VAR_STRING: return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_STRING: From 09b8eca82504c84b6a0900f09a238debdf20df7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 13 Jan 2025 20:56:41 +0000 Subject: [PATCH 55/80] gtid_mode is mysql specific --- flow/connectors/mysql/mysql.go | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 910bae42c7..635c9c1b83 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -151,17 +151,22 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx 
context.Context, cmd string, } func (c *MySqlConnector) GetGtidModeOn(ctx context.Context) (bool, error) { - rr, err := c.Execute(ctx, "select @@global.gtid_mode") - if err != nil { - return false, err - } + if c.config.Flavor == mysql.MySQLFlavor { + rr, err := c.Execute(ctx, "select @@global.gtid_mode") + if err != nil { + return false, err + } - gtid_mode, err := rr.GetString(0, 0) - if err != nil { - return false, err - } + gtid_mode, err := rr.GetString(0, 0) + if err != nil { + return false, err + } - return gtid_mode == "ON", nil + return gtid_mode == "ON", nil + } else { + // TODO how do we choose with mariadb? + return true, nil + } } func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, error) { From 19b25181b4a80c71817537d830d3c03641e9a9af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 13 Jan 2025 21:19:12 +0000 Subject: [PATCH 56/80] fix update --- flow/e2e/clickhouse/peer_flow_ch_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index f95e76c454..c058397311 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -353,7 +353,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Disabled() { e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") - require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id = 2, "key" = 'update' WHERE id = 1`, srcFullName))) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id=2, "key"='update' WHERE id=1`, srcFullName))) e2e.EnvWaitFor(s.t, env, time.Minute, "waiting for duplicate row", func() bool { rows, err := s.GetRows(dstTableName, "id") @@ -394,7 +394,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Enabled() { e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on initial", srcTableName, dstTableName, "id,\"key\"") - require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id = 2, key = 'update' WHERE id = 1`, srcFullName))) + require.NoError(s.t, s.source.Exec(fmt.Sprintf(`UPDATE %s SET id=2, "key"='update' WHERE id=1`, srcFullName))) e2e.EnvWaitForEqualTablesWithNames(env, s, "waiting on cdc", srcTableName, dstTableName, "id,\"key\"") From 6261fdefa548c76d74fc39730dd1bce0e91f306e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 13 Jan 2025 22:58:23 +0000 Subject: [PATCH 57/80] try mysql over maria --- .github/workflows/flow.yml | 4 ++-- flow/e2e/mysql.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/flow.yml b/.github/workflows/flow.yml index 2251082ab1..0c752221c0 100644 --- a/.github/workflows/flow.yml +++ b/.github/workflows/flow.yml @@ -25,11 +25,11 @@ jobs: POSTGRES_DB: postgres POSTGRES_INITDB_ARGS: --locale=C.UTF-8 mysql: - image: mariadb:lts-ubi + image: mysql:oracle ports: - 3306:3306 env: - MARIADB_ROOT_PASSWORD: maria + MYSQL_ROOT_PASSWORD: cipass redpanda: image: redpandadata/redpanda@sha256:7214ddaf8426d25936459cf77c1f905566a4483a97d2b13006120dcd98a5c846 ports: diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 09b28d1676..2f37ae10e8 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -18,12 +18,12 @@ var mysqlConfig = &protos.MySqlConfig{ Host: "localhost", Port: 3306, User: "root", - Password: "maria", + Password: "cipass", Database: "", Setup: nil, Compression: 0, DisableTls: true, - Flavor: "mariadb", + Flavor: "mysql", } func SetupMySQL(t *testing.T, suffix 
string) (*MySqlSource, error) { From 126c8c48659a2adc4f94f882352ca9a69c5369e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Mon, 13 Jan 2025 23:37:39 +0000 Subject: [PATCH 58/80] more quoted key --- flow/e2e/generic/generic_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index 9159787d2f..193e4c92f7 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -91,7 +91,7 @@ func (s Generic) Test_Simple_Flow() { } t.Log("Inserted 10 rows into the source table") - e2e.EnvWaitForEqualTablesWithNames(env, s, "normalizing 10 rows", srcTable, dstTable, `id,key,value,myh`) + e2e.EnvWaitForEqualTablesWithNames(env, s, "normalizing 10 rows", srcTable, dstTable, `id,"key",value,myh`) env.Cancel() e2e.RequireEnvCanceled(t, env) } From bdb6a3e4fb427cb648d19582406b0db5d061a4d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 04:00:28 +0000 Subject: [PATCH 59/80] cleanup while preparing demo --- flow/connectors/mysql/cdc.go | 184 ++++++++++++++++++--------------- flow/connectors/mysql/mysql.go | 8 +- 2 files changed, 107 insertions(+), 85 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index f541ac933d..0b65760f13 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -322,116 +322,134 @@ func (c *MySqlConnector) PullRecords( sourceTableName := string(ev.Table.Schema) + "." + string(ev.Table.Table) // TODO this is fragile destinationTableName := req.TableNameMapping[sourceTableName].Name schema := req.TableNameSchemaMapping[destinationTableName] - switch event.Header.EventType { - case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: - return errors.New("mysql v0 replication protocol not supported") - case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2, replication.MARIADB_WRITE_ROWS_COMPRESSED_EVENT_V1: - for _, row := range ev.Rows { - items := model.NewRecordItems(len(row)) - for idx, val := range row { - fd := schema.Columns[idx] - items.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) + if schema != nil { + switch event.Header.EventType { + case replication.WRITE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv0: + return errors.New("mysql v0 replication protocol not supported") + case replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2, replication.MARIADB_WRITE_ROWS_COMPRESSED_EVENT_V1: + for _, row := range ev.Rows { + items := model.NewRecordItems(len(row)) + for idx, val := range row { + fd := schema.Columns[idx] + val, err := QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + if err != nil { + return err + } + items.AddColumn(fd.Name, val) + } + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, &model.InsertRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, + }); err != nil { + return err + } } - - recordCount += 1 - if err := req.RecordStream.AddRecord(ctx, &model.InsertRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - Items: items, - SourceTableName: sourceTableName, - DestinationTableName: destinationTableName, - }); err != nil { - 
return err - } - } - case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2, replication.MARIADB_UPDATE_ROWS_COMPRESSED_EVENT_V1: - // TODO populate UnchangedToastColumns with ev.SkippedColumns - for idx := 0; idx < len(ev.Rows); idx += 2 { - oldRow := ev.Rows[idx] - oldItems := model.NewRecordItems(len(oldRow)) - for idx, val := range oldRow { - fd := schema.Columns[idx] - oldItems.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) - } - newRow := ev.Rows[idx+1] - newItems := model.NewRecordItems(len(newRow)) - for idx, val := range ev.Rows[idx+1] { - fd := schema.Columns[idx] - newItems.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) - } - - recordCount += 1 - if err := req.RecordStream.AddRecord(ctx, &model.UpdateRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - OldItems: oldItems, - NewItems: newItems, - SourceTableName: sourceTableName, - DestinationTableName: destinationTableName, - }); err != nil { - return err + case replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2, replication.MARIADB_UPDATE_ROWS_COMPRESSED_EVENT_V1: + // TODO populate UnchangedToastColumns with ev.SkippedColumns + for idx := 0; idx < len(ev.Rows); idx += 2 { + oldRow := ev.Rows[idx] + oldItems := model.NewRecordItems(len(oldRow)) + for idx, val := range oldRow { + fd := schema.Columns[idx] + val, err := QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + if err != nil { + return err + } + oldItems.AddColumn(fd.Name, val) + } + newRow := ev.Rows[idx+1] + newItems := model.NewRecordItems(len(newRow)) + for idx, val := range ev.Rows[idx+1] { + fd := schema.Columns[idx] + val, err := QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + if err != nil { + return err + } + newItems.AddColumn(fd.Name, val) + } + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, &model.UpdateRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + OldItems: oldItems, + NewItems: newItems, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, + }); err != nil { + return err + } } - } - case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2, replication.MARIADB_DELETE_ROWS_COMPRESSED_EVENT_V1: - for _, row := range ev.Rows { - items := model.NewRecordItems(len(row)) - for idx, val := range row { - fd := schema.Columns[idx] - items.AddColumn(fd.Name, QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val)) - } - - recordCount += 1 - if err := req.RecordStream.AddRecord(ctx, &model.DeleteRecord[model.RecordItems]{ - BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, - Items: items, - SourceTableName: sourceTableName, - DestinationTableName: destinationTableName, - }); err != nil { - return err + case replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2, replication.MARIADB_DELETE_ROWS_COMPRESSED_EVENT_V1: + for _, row := range ev.Rows { + items := model.NewRecordItems(len(row)) + for idx, val := range row { + fd := schema.Columns[idx] + val, err := QValueFromMysqlRowEvent(ev.Table.ColumnType[idx], qvalue.QValueKind(fd.Type), val) + if err != nil { + return err + } + items.AddColumn(fd.Name, val) + } + + recordCount += 1 + if err := req.RecordStream.AddRecord(ctx, 
&model.DeleteRecord[model.RecordItems]{ + BaseRecord: model.BaseRecord{CommitTimeNano: int64(event.Header.Timestamp) * 1e9}, + Items: items, + SourceTableName: sourceTableName, + DestinationTableName: destinationTableName, + }); err != nil { + return err + } } + default: } - default: } } } return nil } -func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qvalue.QValue { +func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) (qvalue.QValue, error) { // TODO signedness, in ev.Table, need to extend QValue system // See go-mysql row_event.go for mapping switch val := val.(type) { case nil: - return qvalue.QValueNull(qkind) + return qvalue.QValueNull(qkind), nil case int8: // TODO qvalue.Int8 - return qvalue.QValueInt16{Val: int16(val)} + return qvalue.QValueInt16{Val: int16(val)}, nil case int16: - return qvalue.QValueInt16{Val: val} + return qvalue.QValueInt16{Val: val}, nil case int32: - return qvalue.QValueInt32{Val: val} + return qvalue.QValueInt32{Val: val}, nil case int64: - return qvalue.QValueInt64{Val: val} + return qvalue.QValueInt64{Val: val}, nil case float32: - return qvalue.QValueFloat32{Val: val} + return qvalue.QValueFloat32{Val: val}, nil case float64: - return qvalue.QValueFloat64{Val: val} + return qvalue.QValueFloat64{Val: val}, nil case decimal.Decimal: - return qvalue.QValueNumeric{Val: val} + return qvalue.QValueNumeric{Val: val}, nil case int: // YEAR: https://dev.mysql.com/doc/refman/8.4/en/year.html - return qvalue.QValueInt16{Val: int16(val)} + return qvalue.QValueInt16{Val: int16(val)}, nil case time.Time: - return qvalue.QValueTimestamp{Val: val} + return qvalue.QValueTimestamp{Val: val}, nil case *replication.JsonDiff: // TODO support somehow?? - return qvalue.QValueNull(qvalue.QValueKindJSON) + return qvalue.QValueNull(qvalue.QValueKindJSON), nil case []byte: switch mytype { case mysql.MYSQL_TYPE_BLOB: - return qvalue.QValueBytes{Val: val} + return qvalue.QValueBytes{Val: val}, nil case mysql.MYSQL_TYPE_JSON: - return qvalue.QValueJSON{Val: string(val)} + return qvalue.QValueJSON{Val: string(val)}, nil case mysql.MYSQL_TYPE_GEOMETRY: // TODO figure out mysql geo encoding - return qvalue.QValueGeometry{Val: string(val)} + return qvalue.QValueGeometry{Val: string(val)}, nil } case string: switch mytype { @@ -440,12 +458,16 @@ func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) qval case mysql.MYSQL_TYPE_TIME2: // TODO parse case mysql.MYSQL_TYPE_DATE: - // TODO parse + val, err := time.Parse(time.DateOnly, string(val)) + if err != nil { + return nil, err + } + return qvalue.QValueDate{Val: val}, nil case mysql.MYSQL_TYPE_VARCHAR, mysql.MYSQL_TYPE_VAR_STRING, mysql.MYSQL_TYPE_STRING: - return qvalue.QValueString{Val: val} + return qvalue.QValueString{Val: val}, nil } } - panic(fmt.Sprintf("unexpected type %T for mysql type %d", val, mytype)) + return nil, fmt.Errorf("unexpected type %T for mysql type %d", val, mytype) } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 635c9c1b83..e09e5e6b6d 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -270,13 +270,13 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { case mysql.MYSQL_TYPE_SET: return qvalue.QValueKindInt64, nil case mysql.MYSQL_TYPE_TINY_BLOB: - return qvalue.QValueKindString, nil + return qvalue.QValueKindBytes, nil case mysql.MYSQL_TYPE_MEDIUM_BLOB: - return qvalue.QValueKindString, nil + return qvalue.QValueKindBytes, nil case mysql.MYSQL_TYPE_LONG_BLOB: - 
return qvalue.QValueKindString, nil + return qvalue.QValueKindBytes, nil case mysql.MYSQL_TYPE_BLOB: - return qvalue.QValueKindString, nil + return qvalue.QValueKindBytes, nil case mysql.MYSQL_TYPE_VAR_STRING: return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_STRING: From 436307780e84d6137898d0806f51544a1dfd6aeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 14:08:09 +0000 Subject: [PATCH 60/80] lint --- flow/connectors/mysql/cdc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 0b65760f13..25a4ce8b92 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -458,7 +458,7 @@ func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) (qva case mysql.MYSQL_TYPE_TIME2: // TODO parse case mysql.MYSQL_TYPE_DATE: - val, err := time.Parse(time.DateOnly, string(val)) + val, err := time.Parse(time.DateOnly, val) if err != nil { return nil, err } From e9da7ffcd1f692e25701e0ce56ff415910da5309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 15:38:39 +0000 Subject: [PATCH 61/80] is text blob with charset? --- flow/connectors/mysql/cdc.go | 2 +- flow/connectors/mysql/mysql.go | 24 +++++++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 25a4ce8b92..c5ebc0b825 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -71,7 +71,7 @@ func (c *MySqlConnector) getTableSchemaForTable( primary := make([]string, 0) for _, field := range rs.Fields { - qkind, err := qkindFromMysql(field.Type) + qkind, err := qkindFromMysql(field.Type, field.Charset) if err != nil { return nil, err } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index e09e5e6b6d..c899578ebe 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -219,7 +219,7 @@ func (c *MySqlConnector) GetVersion(ctx context.Context) (string, error) { return version, nil } -func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { +func qkindFromMysql(ty uint8, charset uint16) (qvalue.QValueKind, error) { switch ty { case mysql.MYSQL_TYPE_DECIMAL: return qvalue.QValueKindNumeric, nil @@ -234,7 +234,8 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { case mysql.MYSQL_TYPE_DOUBLE: return qvalue.QValueKindFloat64, nil case mysql.MYSQL_TYPE_NULL: - return qvalue.QValueKindInvalid, nil // TODO qvalue.QValueKindNothing + // TODO qvalue.QValueKindNothing, but don't think this can actually be column type + return qvalue.QValueKindInvalid, nil case mysql.MYSQL_TYPE_TIMESTAMP: return qvalue.QValueKindTimestamp, nil case mysql.MYSQL_TYPE_LONGLONG: @@ -269,17 +270,10 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { return qvalue.QValueKindInt64, nil case mysql.MYSQL_TYPE_SET: return qvalue.QValueKindInt64, nil - case mysql.MYSQL_TYPE_TINY_BLOB: + case mysql.MYSQL_TYPE_TINY_BLOB, mysql.MYSQL_TYPE_MEDIUM_BLOB, mysql.MYSQL_TYPE_LONG_BLOB, mysql.MYSQL_TYPE_BLOB: + slog.Info("mymymy blob", slog.Int("type", int(ty)), slog.Int("charset", int(charset))) return qvalue.QValueKindBytes, nil - case mysql.MYSQL_TYPE_MEDIUM_BLOB: - return qvalue.QValueKindBytes, nil - case mysql.MYSQL_TYPE_LONG_BLOB: - return qvalue.QValueKindBytes, nil - case mysql.MYSQL_TYPE_BLOB: - return qvalue.QValueKindBytes, nil - case mysql.MYSQL_TYPE_VAR_STRING: - return qvalue.QValueKindString, nil - case 
mysql.MYSQL_TYPE_STRING: + case mysql.MYSQL_TYPE_VAR_STRING, mysql.MYSQL_TYPE_STRING: return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_GEOMETRY: return qvalue.QValueKindGeometry, nil @@ -291,7 +285,7 @@ func qkindFromMysql(ty uint8) (qvalue.QValueKind, error) { func QRecordSchemaFromMysqlFields(fields []*mysql.Field) (qvalue.QRecordSchema, error) { schema := make([]qvalue.QField, 0, len(fields)) for _, field := range fields { - qkind, err := qkindFromMysql(field.Type) + qkind, err := qkindFromMysql(field.Type, field.Charset) if err != nil { return qvalue.QRecordSchema{}, err } @@ -308,7 +302,7 @@ func QRecordSchemaFromMysqlFields(fields []*mysql.Field) (qvalue.QRecordSchema, } func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qvalue.QValue, error) { - // TODO fill this in, maybe contribute upstream, figvure out how numeric etc fit in + // TODO fill this in, maybe contribute upstream, figure out how numeric etc fit in switch v := fv.Value().(type) { case nil: return qvalue.QValueNull(qkind), nil @@ -371,7 +365,7 @@ func QValueFromMysqlFieldValue(qkind qvalue.QValueKind, fv mysql.FieldValue) (qv } return qvalue.QValueDate{Val: val}, nil default: - return nil, fmt.Errorf("cannot convert string %v to %s", v, qkind) + return nil, fmt.Errorf("cannot convert bytes %v to %s", v, qkind) } default: return nil, fmt.Errorf("unexpected mysql type %T", v) From 4a03d8dba77e5f41bdd08ed18157202e5b4ea660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 16:05:36 +0000 Subject: [PATCH 62/80] use charset for blob --- flow/connectors/mysql/cdc.go | 2 +- flow/connectors/mysql/mysql.go | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index c5ebc0b825..86f8a0a4ad 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -71,7 +71,7 @@ func (c *MySqlConnector) getTableSchemaForTable( primary := make([]string, 0) for _, field := range rs.Fields { - qkind, err := qkindFromMysql(field.Type, field.Charset) + qkind, err := qkindFromMysql(field) if err != nil { return nil, err } diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index c899578ebe..06f5598f01 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -219,8 +219,8 @@ func (c *MySqlConnector) GetVersion(ctx context.Context) (string, error) { return version, nil } -func qkindFromMysql(ty uint8, charset uint16) (qvalue.QValueKind, error) { - switch ty { +func qkindFromMysql(field *mysql.Field) (qvalue.QValueKind, error) { + switch field.Type { case mysql.MYSQL_TYPE_DECIMAL: return qvalue.QValueKindNumeric, nil case mysql.MYSQL_TYPE_TINY: @@ -271,21 +271,24 @@ func qkindFromMysql(ty uint8, charset uint16) (qvalue.QValueKind, error) { case mysql.MYSQL_TYPE_SET: return qvalue.QValueKindInt64, nil case mysql.MYSQL_TYPE_TINY_BLOB, mysql.MYSQL_TYPE_MEDIUM_BLOB, mysql.MYSQL_TYPE_LONG_BLOB, mysql.MYSQL_TYPE_BLOB: - slog.Info("mymymy blob", slog.Int("type", int(ty)), slog.Int("charset", int(charset))) - return qvalue.QValueKindBytes, nil + if field.Charset == 0x3f { // binary https://dev.mysql.com/doc/dev/mysql-server/8.4.3/page_protocol_basic_character_set.html + return qvalue.QValueKindBytes, nil + } else { + return qvalue.QValueKindString, nil + } case mysql.MYSQL_TYPE_VAR_STRING, mysql.MYSQL_TYPE_STRING: return qvalue.QValueKindString, nil case mysql.MYSQL_TYPE_GEOMETRY: return qvalue.QValueKindGeometry, nil default: - return 
qvalue.QValueKind(""), fmt.Errorf("unknown mysql type %d", ty) + return qvalue.QValueKind(""), fmt.Errorf("unknown mysql type %d", field.Type) } } func QRecordSchemaFromMysqlFields(fields []*mysql.Field) (qvalue.QRecordSchema, error) { schema := make([]qvalue.QField, 0, len(fields)) for _, field := range fields { - qkind, err := qkindFromMysql(field.Type, field.Charset) + qkind, err := qkindFromMysql(field) if err != nil { return qvalue.QRecordSchema{}, err } From 99661ee64e104fcbc2a51f736f4e054d87bde273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 16:54:35 +0000 Subject: [PATCH 63/80] don't trust mysql types also move GetRows for SuiteSource into interface --- flow/connectors/mysql/cdc.go | 32 ++++++------ flow/e2e/bigquery/peer_flow_bq_test.go | 6 +-- flow/e2e/congen.go | 2 + flow/e2e/mysql.go | 37 ++++++++++++++ flow/e2e/pg.go | 13 +++++ flow/e2e/snowflake/qrep_flow_sf_test.go | 2 +- flow/e2e/test_utils.go | 65 ++----------------------- 7 files changed, 77 insertions(+), 80 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 86f8a0a4ad..c49408c5f2 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -442,31 +442,35 @@ func QValueFromMysqlRowEvent(mytype byte, qkind qvalue.QValueKind, val any) (qva // TODO support somehow?? return qvalue.QValueNull(qvalue.QValueKindJSON), nil case []byte: - switch mytype { - case mysql.MYSQL_TYPE_BLOB: + switch qkind { + case qvalue.QValueKindBytes: return qvalue.QValueBytes{Val: val}, nil - case mysql.MYSQL_TYPE_JSON: + case qvalue.QValueKindString: + return qvalue.QValueString{Val: string(val)}, nil + case qvalue.QValueKindJSON: return qvalue.QValueJSON{Val: string(val)}, nil - case mysql.MYSQL_TYPE_GEOMETRY: + case qvalue.QValueKindGeometry: // TODO figure out mysql geo encoding return qvalue.QValueGeometry{Val: string(val)}, nil } case string: - switch mytype { - case mysql.MYSQL_TYPE_TIME: - // TODO parse - case mysql.MYSQL_TYPE_TIME2: - // TODO parse - case mysql.MYSQL_TYPE_DATE: + switch qkind { + case qvalue.QValueKindBytes: + return qvalue.QValueBytes{Val: []byte(val)}, nil + case qvalue.QValueKindString: + return qvalue.QValueString{Val: val}, nil + case qvalue.QValueKindJSON: + return qvalue.QValueJSON{Val: val}, nil + case qvalue.QValueKindGeometry: + // TODO figure out mysql geo encoding + return qvalue.QValueGeometry{Val: val}, nil + // TODO more time types + case qvalue.QValueKindDate: val, err := time.Parse(time.DateOnly, val) if err != nil { return nil, err } return qvalue.QValueDate{Val: val}, nil - case mysql.MYSQL_TYPE_VARCHAR, - mysql.MYSQL_TYPE_VAR_STRING, - mysql.MYSQL_TYPE_STRING: - return qvalue.QValueString{Val: val}, nil } } return nil, fmt.Errorf("unexpected type %T for mysql type %d", val, mytype) diff --git a/flow/e2e/bigquery/peer_flow_bq_test.go b/flow/e2e/bigquery/peer_flow_bq_test.go index a0a7d558c7..6ffc70fa68 100644 --- a/flow/e2e/bigquery/peer_flow_bq_test.go +++ b/flow/e2e/bigquery/peer_flow_bq_test.go @@ -1103,7 +1103,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Soft_Delete_Basic() { _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(`DELETE FROM %s WHERE id=1`, srcTableName)) e2e.EnvNoError(s.t, env, err) e2e.EnvWaitFor(s.t, env, 3*time.Minute, "normalize delete", func() bool { - pgRows, err := e2e.GetPgRows(s.conn, s.bqSuffix, srcName, "id,c1,c2,t") + pgRows, err := s.Source().GetRows(s.bqSuffix, srcName, "id,c1,c2,t") if err != nil { return false } @@ -1248,7 +1248,7 @@ func (s PeerFlowE2ETestSuiteBQ) 
Test_Soft_Delete_UD_Same_Batch() { e2e.EnvNoError(s.t, env, insertTx.Commit(context.Background())) e2e.EnvWaitFor(s.t, env, 3*time.Minute, "normalize transaction", func() bool { - pgRows, err := e2e.GetPgRows(s.conn, s.bqSuffix, srcName, "id,c1,c2,t") + pgRows, err := s.Source().GetRows(s.bqSuffix, srcName, "id,c1,c2,t") e2e.EnvNoError(s.t, env, err) rows, err := s.GetRowsWhere(dstName, "id,c1,c2,t", "NOT _PEERDB_IS_DELETED") if err != nil { @@ -1312,7 +1312,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Soft_Delete_Insert_After_Delete() { "DELETE FROM %s WHERE id=1", srcTableName)) e2e.EnvNoError(s.t, env, err) e2e.EnvWaitFor(s.t, env, 3*time.Minute, "normalize delete", func() bool { - pgRows, err := e2e.GetPgRows(s.conn, s.bqSuffix, tableName, "id,c1,c2,t") + pgRows, err := s.Source().GetRows(s.bqSuffix, tableName, "id,c1,c2,t") if err != nil { return false } diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index 6898414c7a..efa5a349a8 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -9,6 +9,7 @@ import ( "github.com/PeerDB-io/peerdb/flow/connectors" "github.com/PeerDB-io/peerdb/flow/connectors/utils" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" "github.com/PeerDB-io/peerdb/flow/peerdbenv" ) @@ -17,6 +18,7 @@ type SuiteSource interface { GeneratePeer(t *testing.T) *protos.Peer Connector() connectors.Connector Exec(sql string) error + GetRows(suffix, table, cols string) (*model.QRecordBatch, error) } func TableMappings(s GenericSuite, tables ...string) []*protos.TableMapping { diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 2f37ae10e8..9b6e386cf7 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -7,7 +7,10 @@ import ( "github.com/PeerDB-io/peerdb/flow/connectors" "github.com/PeerDB-io/peerdb/flow/connectors/mysql" + "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" + "github.com/PeerDB-io/peerdb/flow/model/qvalue" ) type MySqlSource struct { @@ -82,3 +85,37 @@ func (s *MySqlSource) Exec(sql string) error { _, err := s.MySqlConnector.Execute(context.Background(), sql) return err } + +func (s *MySqlSource) GetRows(suffix string, table string, cols string) (*model.QRecordBatch, error) { + rs, err := s.MySqlConnector.Execute( + context.Background(), + fmt.Sprintf(`SELECT %s FROM "e2e_test_%s".%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), + ) + if err != nil { + return nil, err + } + + schema, err := connmysql.QRecordSchemaFromMysqlFields(rs.Fields) + if err != nil { + return nil, err + } + + batch := &model.QRecordBatch{ + Schema: schema, + Records: nil, + } + + for _, row := range rs.Values { + record := make([]qvalue.QValue, 0, len(row)) + for idx, val := range row { + qv, err := connmysql.QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) + if err != nil { + return nil, err + } + record = append(record, qv) + } + batch.Records = append(batch.Records, record) + } + + return batch, nil +} diff --git a/flow/e2e/pg.go b/flow/e2e/pg.go index 5237d4ea72..e918151435 100644 --- a/flow/e2e/pg.go +++ b/flow/e2e/pg.go @@ -13,6 +13,7 @@ import ( "github.com/PeerDB-io/peerdb/flow/connectors" connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/generated/protos" + "github.com/PeerDB-io/peerdb/flow/model" "github.com/PeerDB-io/peerdb/flow/peerdbenv" ) @@ -191,3 +192,15 @@ func (s *PostgresSource) Exec(sql string) error { _, err := 
s.PostgresConnector.Conn().Exec(context.Background(), sql) return err } + +func (s *PostgresSource) GetRows(suffix string, table string, cols string) (*model.QRecordBatch, error) { + pgQueryExecutor, err := s.PostgresConnector.NewQRepQueryExecutor(context.Background(), "testflow", "testpart") + if err != nil { + return nil, err + } + + return pgQueryExecutor.ExecuteAndProcessQuery( + context.Background(), + fmt.Sprintf(`SELECT %s FROM e2e_test_%s.%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), + ) +} diff --git a/flow/e2e/snowflake/qrep_flow_sf_test.go b/flow/e2e/snowflake/qrep_flow_sf_test.go index 8835761659..e81ab61d8a 100644 --- a/flow/e2e/snowflake/qrep_flow_sf_test.go +++ b/flow/e2e/snowflake/qrep_flow_sf_test.go @@ -37,7 +37,7 @@ func (s PeerFlowE2ETestSuiteSF) checkJSONValue(tableName, colName, fieldName, va } func (s PeerFlowE2ETestSuiteSF) compareTableContentsWithDiffSelectorsSF(tableName, pgSelector, sfSelector string) { - pgRows, err := e2e.GetPgRows(s.conn, s.pgSuffix, tableName, pgSelector) + pgRows, err := s.Source().GetRows(s.pgSuffix, tableName, pgSelector) require.NoError(s.t, err) sfRows, err := s.GetRows(tableName, sfSelector) diff --git a/flow/e2e/test_utils.go b/flow/e2e/test_utils.go index 545b80a80c..71838b2c33 100644 --- a/flow/e2e/test_utils.go +++ b/flow/e2e/test_utils.go @@ -22,7 +22,6 @@ import ( "go.temporal.io/sdk/temporal" "github.com/PeerDB-io/peerdb/flow/connectors" - connmysql "github.com/PeerDB-io/peerdb/flow/connectors/mysql" connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" connsnowflake "github.com/PeerDB-io/peerdb/flow/connectors/snowflake" "github.com/PeerDB-io/peerdb/flow/e2eshared" @@ -90,69 +89,11 @@ func EnvTrue(t *testing.T, env WorkflowRun, val bool) { } } -func GetPgRows(conn *connpostgres.PostgresConnector, suffix string, table string, cols string) (*model.QRecordBatch, error) { - pgQueryExecutor, err := conn.NewQRepQueryExecutor(context.Background(), "testflow", "testpart") - if err != nil { - return nil, err - } - - return pgQueryExecutor.ExecuteAndProcessQuery( - context.Background(), - fmt.Sprintf(`SELECT %s FROM e2e_test_%s.%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), - ) -} - -func GetMySqlRows(conn *connmysql.MySqlConnector, suffix string, table string, cols string) (*model.QRecordBatch, error) { - rs, err := conn.Execute( - context.Background(), - fmt.Sprintf(`SELECT %s FROM "e2e_test_%s".%s ORDER BY id`, cols, suffix, connpostgres.QuoteIdentifier(table)), - ) - if err != nil { - return nil, err - } - - schema, err := connmysql.QRecordSchemaFromMysqlFields(rs.Fields) - if err != nil { - return nil, err - } - - batch := &model.QRecordBatch{ - Schema: schema, - Records: nil, - } - - for _, row := range rs.Values { - record := make([]qvalue.QValue, 0, len(row)) - for idx, val := range row { - qv, err := connmysql.QValueFromMysqlFieldValue(schema.Fields[idx].Type, val) - if err != nil { - return nil, err - } - record = append(record, qv) - } - batch.Records = append(batch.Records, record) - } - - return batch, nil -} - -func GetSuiteSourceRows(suite Suite, table string, cols string) (*model.QRecordBatch, error) { - // TODO move to SuiteSource - switch conn := any(suite.Source().Connector()).(type) { - case *connpostgres.PostgresConnector: - return GetPgRows(conn, suite.Suffix(), table, cols) - case *connmysql.MySqlConnector: - return GetMySqlRows(conn, suite.Suffix(), table, cols) - default: - panic("unknown connector type") - } -} - func RequireEqualTables(suite RowSource, 
table string, cols string) { t := suite.T() t.Helper() - sourceRows, err := GetSuiteSourceRows(suite, table, cols) + sourceRows, err := suite.Source().GetRows(suite.Suffix(), table, cols) require.NoError(t, err) rows, err := suite.GetRows(table, cols) @@ -171,7 +112,7 @@ func EnvEqualTablesWithNames( t := suite.T() t.Helper() - sourceRows, err := GetSuiteSourceRows(suite, srcTable, cols) + sourceRows, err := suite.Source().GetRows(suite.Suffix(), srcTable, cols) EnvNoError(t, env, err) rows, err := suite.GetRows(dstTable, cols) @@ -205,7 +146,7 @@ func EnvWaitForEqualTablesWithNames( EnvWaitFor(t, env, 3*time.Minute, reason, func() bool { t.Helper() - sourceRows, err := GetSuiteSourceRows(suite, srcTable, cols) + sourceRows, err := suite.Source().GetRows(suite.Suffix(), srcTable, cols) if err != nil { t.Log(err) return false From b076396b3709d94117a15b54bf6fd9bda6a9a064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Tue, 14 Jan 2025 20:34:45 +0000 Subject: [PATCH 64/80] e2e: snowflake does not like quoted columns, just avoid keyword in generic test --- flow/connectors/mysql/mysql.go | 8 ++++---- flow/e2e/generic/generic_test.go | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 06f5598f01..060e3b88ef 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -152,7 +152,7 @@ func (c *MySqlConnector) ExecuteSelectStreaming(ctx context.Context, cmd string, func (c *MySqlConnector) GetGtidModeOn(ctx context.Context) (bool, error) { if c.config.Flavor == mysql.MySQLFlavor { - rr, err := c.Execute(ctx, "select @@global.gtid_mode") + rr, err := c.Execute(ctx, "select @@gtid_mode") if err != nil { return false, err } @@ -190,13 +190,13 @@ func (c *MySqlConnector) GetMasterGTIDSet(ctx context.Context) (mysql.GTIDSet, e var query string switch c.config.Flavor { case mysql.MariaDBFlavor: - query = "select @@global.gtid_current_pos" + query = "select @@gtid_current_pos" default: - query = "select @@global.gtid_executed" + query = "select @@gtid_executed" } rr, err := c.Execute(ctx, query) if err != nil { - return nil, fmt.Errorf("failed to select @@global.gtid_executed: %w", err) + return nil, fmt.Errorf("failed to select @@gtid_executed: %w", err) } gx, err := rr.GetString(0, 0) if err != nil { diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index 193e4c92f7..ff1d3b5ac2 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -64,7 +64,7 @@ func (s Generic) Test_Simple_Flow() { require.NoError(t, s.Source().Exec(fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id SERIAL PRIMARY KEY, - "key" TEXT NOT NULL, + ky TEXT NOT NULL, value TEXT NOT NULL, myh %s NOT NULL ); @@ -86,12 +86,12 @@ func (s Generic) Test_Simple_Flow() { testKey := fmt.Sprintf("test_key_%d", i) testValue := fmt.Sprintf("test_value_%d", i) e2e.EnvNoError(t, env, s.Source().Exec( - fmt.Sprintf(`INSERT INTO %s("key", value, myh) VALUES ('%s', '%s', '"a"=>"b"')`, srcSchemaTable, testKey, testValue), + fmt.Sprintf(`INSERT INTO %s(ky, value, myh) VALUES ('%s', '%s', '"a"=>"b"')`, srcSchemaTable, testKey, testValue), )) } t.Log("Inserted 10 rows into the source table") - e2e.EnvWaitForEqualTablesWithNames(env, s, "normalizing 10 rows", srcTable, dstTable, `id,"key",value,myh`) + e2e.EnvWaitForEqualTablesWithNames(env, s, "normalizing 10 rows", srcTable, dstTable, `id,ky,value,myh`) env.Cancel() e2e.RequireEnvCanceled(t, env) } From 
56f8ccc4ed7cdd8cbf69ad0ab769f8cde4f977ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 15 Jan 2025 01:01:11 +0000 Subject: [PATCH 65/80] disable gtid on maria, log binary log status --- flow/connectors/mysql/mysql.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index 060e3b88ef..a1ff070ad0 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -165,13 +165,13 @@ func (c *MySqlConnector) GetGtidModeOn(ctx context.Context) (bool, error) { return gtid_mode == "ON", nil } else { // TODO how do we choose with mariadb? - return true, nil + return false, nil } } func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, error) { showBinlogStatus := "SHOW BINARY LOG STATUS" - if eq, err := c.conn.CompareServerVersion("8.4.0"); (err == nil) && (eq < 0) { + if eq, err := c.conn.CompareServerVersion("8.4.0"); err == nil && eq < 0 { showBinlogStatus = "SHOW MASTER STATUS" } @@ -180,8 +180,12 @@ func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, erro return mysql.Position{}, fmt.Errorf("failed to SHOW BINARY LOG STATUS: %w", err) } + slog.Info("mymymy binary log status", + slog.Any("a0", rr.Values[0][0].Value()), slog.Any("a1", rr.Values[0][1].Value()), + slog.Any("a2", rr.Values[0][2].Value()), slog.Any("a3", rr.Values[0][3].Value()), + slog.Any("a4", rr.Values[0][4].Value())) name, _ := rr.GetString(0, 0) - pos, _ := rr.GetInt(0, 1) + pos, _ := rr.GetUint(0, 1) return mysql.Position{Name: name, Pos: uint32(pos)}, nil } From 6d766d9a70a085a324916b94ce9aeff80d76ed4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 15 Jan 2025 03:43:46 +0000 Subject: [PATCH 66/80] run e2e vs both maria & mysql --- .github/workflows/flow.yml | 6 ++++ flow/connectors/mysql/cdc.go | 2 +- flow/connectors/mysql/mysql.go | 9 ++--- flow/e2e/clickhouse/peer_flow_ch_test.go | 4 +++ flow/e2e/generic/generic_test.go | 4 +++ flow/e2e/mysql.go | 43 +++++++++++++++++++++--- 6 files changed, 56 insertions(+), 12 deletions(-) diff --git a/.github/workflows/flow.yml b/.github/workflows/flow.yml index 0c752221c0..f5fb6e0ce6 100644 --- a/.github/workflows/flow.yml +++ b/.github/workflows/flow.yml @@ -30,6 +30,12 @@ jobs: - 3306:3306 env: MYSQL_ROOT_PASSWORD: cipass + mariadb: + image: mariadb:lts-ubi + ports: + - 3300:3306 + env: + MARIADB_ROOT_PASSWORD: cipass redpanda: image: redpandadata/redpanda@sha256:7214ddaf8426d25936459cf77c1f905566a4483a97d2b13006120dcd98a5c846 ports: diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index c49408c5f2..5c70cbfe37 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -287,7 +287,7 @@ func (c *MySqlConnector) PullRecords( switch ev := event.Event.(type) { case *replication.RotateEvent: - if gset == nil { + if gset == nil && event.Header.Timestamp != 0 { req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%d", string(ev.NextLogName), ev.Position)) } case *replication.MariadbGTIDEvent: diff --git a/flow/connectors/mysql/mysql.go b/flow/connectors/mysql/mysql.go index a1ff070ad0..7eae5d24d7 100644 --- a/flow/connectors/mysql/mysql.go +++ b/flow/connectors/mysql/mysql.go @@ -164,8 +164,8 @@ func (c *MySqlConnector) GetGtidModeOn(ctx context.Context) (bool, error) { return gtid_mode == "ON", nil } else { - // TODO how do we choose with mariadb? 
- return false, nil + // mariadb always enabled: https://mariadb.com/kb/en/gtid/#using-global-transaction-ids + return true, nil } } @@ -180,13 +180,8 @@ func (c *MySqlConnector) GetMasterPos(ctx context.Context) (mysql.Position, erro return mysql.Position{}, fmt.Errorf("failed to SHOW BINARY LOG STATUS: %w", err) } - slog.Info("mymymy binary log status", - slog.Any("a0", rr.Values[0][0].Value()), slog.Any("a1", rr.Values[0][1].Value()), - slog.Any("a2", rr.Values[0][2].Value()), slog.Any("a3", rr.Values[0][3].Value()), - slog.Any("a4", rr.Values[0][4].Value())) name, _ := rr.GetString(0, 0) pos, _ := rr.GetUint(0, 1) - return mysql.Position{Name: name, Pos: uint32(pos)}, nil } diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index c058397311..7d55a5d408 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -35,6 +35,10 @@ func TestPeerFlowE2ETestSuiteMySQL_CH(t *testing.T) { e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupMySQL)) } +func TestPeerFlowE2ETestSuiteMariaDB_CH(t *testing.T) { + e2eshared.RunSuite(t, SetupSuite(t, e2e.SetupMariaDB)) +} + func (s ClickHouseSuite) attachSchemaSuffix(tableName string) string { return fmt.Sprintf("e2e_test_%s.%s", s.suffix, tableName) } diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index ff1d3b5ac2..b35cae86ef 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -40,6 +40,10 @@ func TestGenericCH_MySQL(t *testing.T) { e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite(t, e2e.SetupMySQL))) } +func TestGenericCH_MariaDB(t *testing.T) { + e2eshared.RunSuite(t, SetupGenericSuite(e2e_clickhouse.SetupSuite(t, e2e.SetupMariaDB))) +} + type Generic struct { e2e.GenericSuite } diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index 9b6e386cf7..fa215769c7 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -5,6 +5,8 @@ import ( "fmt" "testing" + "github.com/go-mysql-org/go-mysql/mysql" + "github.com/PeerDB-io/peerdb/flow/connectors" "github.com/PeerDB-io/peerdb/flow/connectors/mysql" "github.com/PeerDB-io/peerdb/flow/connectors/postgres" @@ -15,6 +17,7 @@ import ( type MySqlSource struct { *connmysql.MySqlConnector + isMaria bool } var mysqlConfig = &protos.MySqlConfig{ @@ -26,13 +29,40 @@ var mysqlConfig = &protos.MySqlConfig{ Setup: nil, Compression: 0, DisableTls: true, - Flavor: "mysql", + Flavor: mysql.MySQLFlavor, +} + +var mariaConfig = &protos.MySqlConfig{ + Host: "localhost", + Port: 3300, + User: "root", + Password: "cipass", + Database: "", + Setup: nil, + Compression: 0, + DisableTls: true, + Flavor: mysql.MariaDBFlavor, } func SetupMySQL(t *testing.T, suffix string) (*MySqlSource, error) { t.Helper() + return setupMyCore(t, suffix, false) +} + +func SetupMariaDB(t *testing.T, suffix string) (*MySqlSource, error) { + t.Helper() + return setupMyCore(t, suffix, true) +} + +func setupMyCore(t *testing.T, suffix string, isMaria bool) (*MySqlSource, error) { + t.Helper() + + config := mysqlConfig + if isMaria { + config = mariaConfig + } - connector, err := connmysql.NewMySqlConnector(context.Background(), mysqlConfig) + connector, err := connmysql.NewMySqlConnector(context.Background(), config) if err != nil { return nil, fmt.Errorf("failed to create postgres connection: %w", err) } @@ -70,11 +100,16 @@ func (s *MySqlSource) Teardown(t *testing.T, suffix string) { func (s *MySqlSource) GeneratePeer(t *testing.T) *protos.Peer { t.Helper() + config := mysqlConfig + if 
s.isMaria { + config = mariaConfig + } + peer := &protos.Peer{ - Name: "mysql", + Name: config.Flavor, Type: protos.DBType_MYSQL, Config: &protos.Peer_MysqlConfig{ - MysqlConfig: mysqlConfig, + MysqlConfig: config, }, } CreatePeer(t, peer) From eb43083e0792b77034b1641a8636671b0f1b6c5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Wed, 15 Jan 2025 19:22:23 +0000 Subject: [PATCH 67/80] I'm a fool --- flow/connectors/mysql/cdc.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 5c70cbfe37..27353b2d03 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -137,7 +137,7 @@ func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { if err != nil { return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterPos: %w", err) } - lastOffsetText = fmt.Sprintf("!f:%s,%d", pos.Name, pos.Pos) + lastOffsetText = fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos) } if err := c.SetLastOffset( ctx, flowName, model.CdcCheckpoint{Text: lastOffsetText}, @@ -288,7 +288,7 @@ func (c *MySqlConnector) PullRecords( switch ev := event.Event.(type) { case *replication.RotateEvent: if gset == nil && event.Header.Timestamp != 0 { - req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%d", string(ev.NextLogName), ev.Position)) + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%x", string(ev.NextLogName), ev.Position)) } case *replication.MariadbGTIDEvent: if gset != nil { From ec248fde238c396b185e1216f36216f376154eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 16 Jan 2025 02:45:40 +0000 Subject: [PATCH 68/80] i give up --- .github/workflows/flow.yml | 12 ++++++------ flow/e2e/mysql.go | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/flow.yml b/.github/workflows/flow.yml index f5fb6e0ce6..e9433e8763 100644 --- a/.github/workflows/flow.yml +++ b/.github/workflows/flow.yml @@ -30,12 +30,12 @@ jobs: - 3306:3306 env: MYSQL_ROOT_PASSWORD: cipass - mariadb: - image: mariadb:lts-ubi - ports: - - 3300:3306 - env: - MARIADB_ROOT_PASSWORD: cipass + #mariadb: + # image: mariadb:lts-ubi + # ports: + # - 3300:3306 + # env: + # MARIADB_ROOT_PASSWORD: cipass redpanda: image: redpandadata/redpanda@sha256:7214ddaf8426d25936459cf77c1f905566a4483a97d2b13006120dcd98a5c846 ports: diff --git a/flow/e2e/mysql.go b/flow/e2e/mysql.go index fa215769c7..bf48bfa099 100644 --- a/flow/e2e/mysql.go +++ b/flow/e2e/mysql.go @@ -51,6 +51,7 @@ func SetupMySQL(t *testing.T, suffix string) (*MySqlSource, error) { func SetupMariaDB(t *testing.T, suffix string) (*MySqlSource, error) { t.Helper() + t.Skip("skipping until working out how to not have port conflict in GH actions") return setupMyCore(t, suffix, true) } From eec16a3d7b5402dafc7988d40a6653ba2a085750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Thu, 16 Jan 2025 14:42:18 +0000 Subject: [PATCH 69/80] remove comment --- flow/workflows/cdc_flow.go | 6 +----- flow/workflows/snapshot_flow.go | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/flow/workflows/cdc_flow.go b/flow/workflows/cdc_flow.go index 800ffde158..7615f0fe63 100644 --- a/flow/workflows/cdc_flow.go +++ b/flow/workflows/cdc_flow.go @@ -428,11 +428,7 @@ func CDCFlowWorkflow( WaitForCancellation: true, } snapshotFlowCtx := workflow.WithChildOptions(ctx, childSnapshotFlowOpts) - snapshotFlowFuture := workflow.ExecuteChildWorkflow( - snapshotFlowCtx, - SnapshotFlowWorkflow, - cfg, - ) + 
snapshotFlowFuture := workflow.ExecuteChildWorkflow(snapshotFlowCtx, SnapshotFlowWorkflow, cfg) if err := snapshotFlowFuture.Get(snapshotFlowCtx, nil); err != nil { logger.Error("snapshot flow failed", slog.Any("error", err)) return state, fmt.Errorf("failed to execute snapshot workflow: %w", err) diff --git a/flow/workflows/snapshot_flow.go b/flow/workflows/snapshot_flow.go index a1425358d6..74728c1f61 100644 --- a/flow/workflows/snapshot_flow.go +++ b/flow/workflows/snapshot_flow.go @@ -32,7 +32,6 @@ type SnapshotFlowExecution struct { logger log.Logger } -// ensurePullability ensures that the source peer is pullable. func (s *SnapshotFlowExecution) setupReplication( ctx workflow.Context, ) (*protos.SetupReplicationOutput, error) { From 327a59d781fa9c2243ef9bb7d0059f2a5bf7d36f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 01:39:09 +0000 Subject: [PATCH 70/80] remove postgres SlotSignal this avoids an unnecessary goroutine, instead of having a goroutine wait on channel close to close connection just close connection but the larger reason I'm doing this is so that this logic can move behind a connector interface, where mysql will use this as an opportunity to SHOW BINARY LOG STATUS --- flow/activities/snapshot_activity.go | 20 +++++----- flow/connectors/postgres/client.go | 53 ++++++++++--------------- flow/connectors/postgres/postgres.go | 24 ++++++----- flow/connectors/postgres/slot_signal.go | 24 ----------- flow/e2e/postgres/qrep_flow_pg_test.go | 14 +++---- 5 files changed, 50 insertions(+), 85 deletions(-) delete mode 100644 flow/connectors/postgres/slot_signal.go diff --git a/flow/activities/snapshot_activity.go b/flow/activities/snapshot_activity.go index 3df7926ea6..bb380d9f8a 100644 --- a/flow/activities/snapshot_activity.go +++ b/flow/activities/snapshot_activity.go @@ -8,6 +8,7 @@ import ( "sync" "time" + "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgxpool" "go.temporal.io/sdk/activity" @@ -20,7 +21,7 @@ import ( type SlotSnapshotState struct { connector connectors.CDCPullConnector - signal connpostgres.SlotSignal + slotConn *pgx.Conn snapshotName string } @@ -43,7 +44,9 @@ func (a *SnapshotActivity) CloseSlotKeepAlive(ctx context.Context, flowJobName s defer a.SnapshotStatesMutex.Unlock() if s, ok := a.SlotSnapshotStates[flowJobName]; ok { - close(s.signal.CloneComplete) + if s.slotConn != nil { + s.slotConn.Close(ctx) + } connectors.CloseConnector(ctx, s.connector) delete(a.SlotSnapshotStates, flowJobName) } @@ -76,15 +79,12 @@ func (a *SnapshotActivity) SetupReplication( connectors.CloseConnector(ctx, conn) } - slotSignal := connpostgres.NewSlotSignal() - go conn.SetupReplication(ctx, slotSignal, config) - logger.Info("waiting for slot to be created...") - slotInfo := <-slotSignal.SlotCreated + slotInfo, err := conn.SetupReplication(ctx, config) - if slotInfo.Err != nil { - closeConnectionForError(slotInfo.Err) - return nil, fmt.Errorf("slot error: %w", slotInfo.Err) + if err != nil { + closeConnectionForError(err) + return nil, fmt.Errorf("slot error: %w", err) } else { logger.Info("slot created", slog.String("SlotName", slotInfo.SlotName)) } @@ -93,7 +93,7 @@ func (a *SnapshotActivity) SetupReplication( defer a.SnapshotStatesMutex.Unlock() a.SlotSnapshotStates[config.FlowJobName] = SlotSnapshotState{ - signal: slotSignal, + slotConn: slotInfo.Conn, snapshotName: slotInfo.SnapshotName, connector: conn, } diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index cfda1d2a01..ce993ecd0c 100644 --- 
a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -356,13 +356,12 @@ func (c *PostgresConnector) CreatePublication( // createSlotAndPublication creates the replication slot and publication. func (c *PostgresConnector) createSlotAndPublication( ctx context.Context, - signal SlotSignal, s SlotCheckResult, slot string, publication string, tableNameMapping map[string]model.NameAndExclude, doInitialCopy bool, -) { +) (SlotCreationResult, error) { // iterate through source tables and create publication, // expecting tablenames to be schema qualified if !s.PublicationExists { @@ -370,16 +369,12 @@ func (c *PostgresConnector) createSlotAndPublication( for srcTableName := range tableNameMapping { parsedSrcTableName, err := utils.ParseSchemaTable(srcTableName) if err != nil { - signal.SlotCreated <- SlotCreationResult{ - Err: fmt.Errorf("[publication-creation] source table identifier %s is invalid", srcTableName), - } - return + return SlotCreationResult{}, fmt.Errorf("[publication-creation] source table identifier %s is invalid", srcTableName) } srcTableNames = append(srcTableNames, parsedSrcTableName.String()) } if err := c.CreatePublication(ctx, srcTableNames, publication); err != nil { - signal.SlotCreated <- SlotCreationResult{Err: err} - return + return SlotCreationResult{}, err } } @@ -387,22 +382,20 @@ func (c *PostgresConnector) createSlotAndPublication( if !s.SlotExists { conn, err := c.CreateReplConn(ctx) if err != nil { - signal.SlotCreated <- SlotCreationResult{Err: fmt.Errorf("[slot] error acquiring connection: %w", err)} - return + return SlotCreationResult{}, fmt.Errorf("[slot] error acquiring connection: %w", err) } - defer conn.Close(ctx) c.logger.Warn(fmt.Sprintf("Creating replication slot '%s'", slot)) // THIS IS NOT IN A TX! 
if _, err := conn.Exec(ctx, "SET idle_in_transaction_session_timeout=0"); err != nil { - signal.SlotCreated <- SlotCreationResult{Err: fmt.Errorf("[slot] error setting idle_in_transaction_session_timeout: %w", err)} - return + conn.Close(ctx) + return SlotCreationResult{}, fmt.Errorf("[slot] error setting idle_in_transaction_session_timeout: %w", err) } if _, err := conn.Exec(ctx, "SET lock_timeout=0"); err != nil { - signal.SlotCreated <- SlotCreationResult{Err: fmt.Errorf("[slot] error setting lock_timeout: %w", err)} - return + conn.Close(ctx) + return SlotCreationResult{}, fmt.Errorf("[slot] error setting lock_timeout: %w", err) } opts := pglogrepl.CreateReplicationSlotOptions{ @@ -411,39 +404,33 @@ func (c *PostgresConnector) createSlotAndPublication( } res, err := pglogrepl.CreateReplicationSlot(ctx, conn.PgConn(), slot, "pgoutput", opts) if err != nil { - signal.SlotCreated <- SlotCreationResult{Err: fmt.Errorf("[slot] error creating replication slot: %w", err)} - return + conn.Close(ctx) + return SlotCreationResult{}, fmt.Errorf("[slot] error creating replication slot: %w", err) } pgversion, err := c.MajorVersion(ctx) if err != nil { - signal.SlotCreated <- SlotCreationResult{Err: fmt.Errorf("[slot] error getting PG version: %w", err)} - return + conn.Close(ctx) + return SlotCreationResult{}, fmt.Errorf("[slot] error getting PG version: %w", err) } c.logger.Info(fmt.Sprintf("Created replication slot '%s'", slot)) - slotDetails := SlotCreationResult{ + return SlotCreationResult{ SlotName: res.SlotName, SnapshotName: res.SnapshotName, - Err: nil, SupportsTIDScans: pgversion >= shared.POSTGRES_13, - } - signal.SlotCreated <- slotDetails - c.logger.Info("Waiting for clone to complete") - <-signal.CloneComplete - c.logger.Info("Clone complete") + }, nil } else { c.logger.Info(fmt.Sprintf("Replication slot '%s' already exists", slot)) - slotDetails := SlotCreationResult{ + var err error + if doInitialCopy { + err = ErrSlotAlreadyExists + } + return SlotCreationResult{ SlotName: slot, SnapshotName: "", - Err: nil, SupportsTIDScans: false, - } - if doInitialCopy { - slotDetails.Err = ErrSlotAlreadyExists - } - signal.SlotCreated <- slotDetails + }, err } } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 24cfcff9f9..5a56005222 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -57,6 +57,13 @@ type ReplState struct { LastOffset atomic.Int64 } +type SlotCreationResult struct { + Conn *pgx.Conn + SlotName string + SnapshotName string + SupportsTIDScans bool +} + func NewPostgresConnector(ctx context.Context, env map[string]string, pgConfig *protos.PostgresConfig) (*PostgresConnector, error) { logger := shared.LoggerFromCtx(ctx) flowNameInApplicationName, err := peerdbenv.PeerDBApplicationNamePerMirrorName(ctx, nil) @@ -1088,13 +1095,13 @@ func (c *PostgresConnector) FinishExport(tx any) error { return pgtx.Commit(timeout) } -// SetupReplication sets up replication for the source connector. 
-func (c *PostgresConnector) SetupReplication(ctx context.Context, signal SlotSignal, req *protos.SetupReplicationInput) { +// SetupReplication sets up replication for the source connector +func (c *PostgresConnector) SetupReplication( + ctx context.Context, + req *protos.SetupReplicationInput, +) (SlotCreationResult, error) { if !shared.IsValidReplicationName(req.FlowJobName) { - signal.SlotCreated <- SlotCreationResult{ - Err: fmt.Errorf("invalid flow job name: `%s`, it should be ^[a-z_][a-z0-9_]*$", req.FlowJobName), - } - return + return SlotCreationResult{}, fmt.Errorf("invalid flow job name: `%s`, it should be ^[a-z_][a-z0-9_]*$", req.FlowJobName) } // Slotname would be the job name prefixed with "peerflow_slot_" @@ -1111,8 +1118,7 @@ func (c *PostgresConnector) SetupReplication(ctx context.Context, signal SlotSig // Check if the replication slot and publication exist exists, err := c.checkSlotAndPublication(ctx, slotName, publicationName) if err != nil { - signal.SlotCreated <- SlotCreationResult{Err: err} - return + return SlotCreationResult{}, err } tableNameMapping := make(map[string]model.NameAndExclude, len(req.TableNameMapping)) @@ -1123,7 +1129,7 @@ func (c *PostgresConnector) SetupReplication(ctx context.Context, signal SlotSig } } // Create the replication slot and publication - c.createSlotAndPublication(ctx, signal, exists, slotName, publicationName, tableNameMapping, req.DoInitialSnapshot) + return c.createSlotAndPublication(ctx, exists, slotName, publicationName, tableNameMapping, req.DoInitialSnapshot) } func (c *PostgresConnector) PullFlowCleanup(ctx context.Context, jobName string) error { diff --git a/flow/connectors/postgres/slot_signal.go b/flow/connectors/postgres/slot_signal.go deleted file mode 100644 index bd5bcbbd52..0000000000 --- a/flow/connectors/postgres/slot_signal.go +++ /dev/null @@ -1,24 +0,0 @@ -package connpostgres - -type SlotCreationResult struct { - Err error - SlotName string - SnapshotName string - SupportsTIDScans bool -} - -// This struct contains two signals. -// 1. SlotCreated - this can be waited on to ensure that the slot has been created. -// 2. CloneComplete - which can be waited on to ensure that the clone has completed. -type SlotSignal struct { - SlotCreated chan SlotCreationResult - CloneComplete chan struct{} -} - -// NewSlotSignal returns a new SlotSignal. -func NewSlotSignal() SlotSignal { - return SlotSignal{ - SlotCreated: make(chan SlotCreationResult, 1), - CloneComplete: make(chan struct{}), - } -} diff --git a/flow/e2e/postgres/qrep_flow_pg_test.go b/flow/e2e/postgres/qrep_flow_pg_test.go index 875e8a0fd6..32bd78b116 100644 --- a/flow/e2e/postgres/qrep_flow_pg_test.go +++ b/flow/e2e/postgres/qrep_flow_pg_test.go @@ -12,7 +12,6 @@ import ( "github.com/jackc/pgx/v5/pgtype" "github.com/stretchr/testify/require" - connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/e2e" "github.com/PeerDB-io/peerdb/flow/e2eshared" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -159,16 +158,13 @@ func (s PeerFlowE2ETestSuitePG) TestSimpleSlotCreation() { }, } - signal := connpostgres.NewSlotSignal() - go s.conn.SetupReplication(context.Background(), signal, setupReplicationInput) - s.t.Log("waiting for slot creation to complete: " + flowJobName) - slotInfo := <-signal.SlotCreated - s.t.Logf("slot creation complete: %v. 
Signaling clone complete in 2 seconds", slotInfo) - time.Sleep(2 * time.Second) - close(signal.CloneComplete) + slotInfo, err := s.conn.SetupReplication(context.Background(), setupReplicationInput) + require.NoError(s.t, err) + + s.t.Logf("slot creation complete: %v", slotInfo) + require.NoError(s.t, slotInfo.Conn.Close(context.Background())) - require.NoError(s.t, slotInfo.Err) s.t.Logf("successfully setup replication: %s", flowJobName) } From 57b4c5f36994246d8698c68c45d00a42c2e9bed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 02:22:02 +0000 Subject: [PATCH 71/80] move mysql offset setup to SetupReplication make SetupReplication a bit more abstract --- flow/activities/snapshot_activity.go | 26 +++++--------- flow/connectors/core.go | 3 ++ flow/connectors/mysql/cdc.go | 53 ++++++++++++++-------------- flow/connectors/postgres/client.go | 24 ++++++------- flow/connectors/postgres/postgres.go | 13 ++----- flow/model/model.go | 8 +++++ 6 files changed, 60 insertions(+), 67 deletions(-) diff --git a/flow/activities/snapshot_activity.go b/flow/activities/snapshot_activity.go index bb380d9f8a..3312eae63a 100644 --- a/flow/activities/snapshot_activity.go +++ b/flow/activities/snapshot_activity.go @@ -2,26 +2,23 @@ package activities import ( "context" - "errors" "fmt" "log/slog" "sync" "time" - "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgxpool" "go.temporal.io/sdk/activity" "github.com/PeerDB-io/peerdb/flow/alerting" "github.com/PeerDB-io/peerdb/flow/connectors" - connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/generated/protos" "github.com/PeerDB-io/peerdb/flow/shared" ) type SlotSnapshotState struct { - connector connectors.CDCPullConnector - slotConn *pgx.Conn + connector connectors.CDCPullConnectorCore + slotConn interface{ Close(context.Context) error } snapshotName string } @@ -64,27 +61,22 @@ func (a *SnapshotActivity) SetupReplication( a.Alerter.LogFlowEvent(ctx, config.FlowJobName, "Started Snapshot Flow Job") - conn, err := connectors.GetByNameAs[*connpostgres.PostgresConnector](ctx, nil, a.CatalogPool, config.PeerName) + conn, err := connectors.GetByNameAs[connectors.CDCPullConnectorCore](ctx, nil, a.CatalogPool, config.PeerName) if err != nil { - if errors.Is(err, errors.ErrUnsupported) { - logger.Info("setup replication is no-op for non-postgres source") - return nil, nil - } return nil, fmt.Errorf("failed to get connector: %w", err) } - closeConnectionForError := func(err error) { - a.Alerter.LogFlowError(ctx, config.FlowJobName, err) - // it is important to close the connection here as it is not closed in CloseSlotKeepAlive - connectors.CloseConnector(ctx, conn) - } - logger.Info("waiting for slot to be created...") slotInfo, err := conn.SetupReplication(ctx, config) if err != nil { - closeConnectionForError(err) + a.Alerter.LogFlowError(ctx, config.FlowJobName, err) + // it is important to close the connection here as it is not closed in CloseSlotKeepAlive + connectors.CloseConnector(ctx, conn) return nil, fmt.Errorf("slot error: %w", err) + } else if slotInfo.Conn == nil && slotInfo.SlotName == "" { + logger.Info("replication setup without slot") + return nil, nil } else { logger.Info("slot created", slog.String("SlotName", slotInfo.SlotName)) } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 8c5539dec5..cfdf7eacc9 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -67,6 +67,9 @@ type CDCPullConnectorCore interface { // `any` from 
ExportSnapshot passed here when done, allowing transaction to commit FinishExport(any) error + // Setup replication in prep for initial copy + SetupReplication(context.Context, *protos.SetupReplicationInput) (model.SetupReplicationResult, error) + // Methods related to retrieving and pushing records for this connector as a source and destination. SetupReplConn(context.Context) error diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 27353b2d03..7e5652d4a2 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -113,38 +113,39 @@ func (c *MySqlConnector) FinishExport(any) error { return nil } -func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { - // mysql code will spin up new connection for each normalize for now - flowName := ctx.Value(shared.FlowNameKey).(string) - offset, err := c.GetLastOffset(ctx, flowName) +func (c *MySqlConnector) SetupReplication( + ctx context.Context, + req *protos.SetupReplicationInput, +) (model.SetupReplicationResult, error) { + gtidModeOn, err := c.GetGtidModeOn(ctx) if err != nil { - return fmt.Errorf("[mysql] SetupReplConn failed to GetLastOffset: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[mysql] SetupReplication failed to get gtid_mode: %w", err) } - if offset.Text == "" { - gtidModeOn, err := c.GetGtidModeOn(ctx) + var lastOffsetText string + if gtidModeOn { + set, err := c.GetMasterGTIDSet(ctx) if err != nil { - return err - } - var lastOffsetText string - if gtidModeOn { - set, err := c.GetMasterGTIDSet(ctx) - if err != nil { - return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterGTIDSet: %w", err) - } - lastOffsetText = set.String() - } else { - pos, err := c.GetMasterPos(ctx) - if err != nil { - return fmt.Errorf("[mysql] SetupReplConn failed to GetMasterPos: %w", err) - } - lastOffsetText = fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos) + return model.SetupReplicationResult{}, fmt.Errorf("[mysql] SetupReplication failed to GetMasterGTIDSet: %w", err) } - if err := c.SetLastOffset( - ctx, flowName, model.CdcCheckpoint{Text: lastOffsetText}, - ); err != nil { - return fmt.Errorf("[mysql] SetupReplConn failed to SetLastOffset: %w", err) + lastOffsetText = set.String() + } else { + pos, err := c.GetMasterPos(ctx) + if err != nil { + return model.SetupReplicationResult{}, fmt.Errorf("[mysql] SetupReplication failed to GetMasterPos: %w", err) } + lastOffsetText = fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos) } + if err := c.SetLastOffset( + ctx, req.FlowJobName, model.CdcCheckpoint{Text: lastOffsetText}, + ); err != nil { + return model.SetupReplicationResult{}, fmt.Errorf("[mysql] SetupReplication failed to SetLastOffset: %w", err) + } + + return model.SetupReplicationResult{}, nil +} + +func (c *MySqlConnector) SetupReplConn(ctx context.Context) error { + // mysql code will spin up new connection for each normalize for now return nil } diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index ce993ecd0c..21b9e43a5b 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -361,7 +361,7 @@ func (c *PostgresConnector) createSlotAndPublication( publication string, tableNameMapping map[string]model.NameAndExclude, doInitialCopy bool, -) (SlotCreationResult, error) { +) (model.SetupReplicationResult, error) { // iterate through source tables and create publication, // expecting tablenames to be schema qualified if !s.PublicationExists { @@ -369,12 +369,12 @@ func (c *PostgresConnector) 
createSlotAndPublication( for srcTableName := range tableNameMapping { parsedSrcTableName, err := utils.ParseSchemaTable(srcTableName) if err != nil { - return SlotCreationResult{}, fmt.Errorf("[publication-creation] source table identifier %s is invalid", srcTableName) + return model.SetupReplicationResult{}, fmt.Errorf("[publication-creation] source table identifier %s is invalid", srcTableName) } srcTableNames = append(srcTableNames, parsedSrcTableName.String()) } if err := c.CreatePublication(ctx, srcTableNames, publication); err != nil { - return SlotCreationResult{}, err + return model.SetupReplicationResult{}, err } } @@ -382,7 +382,7 @@ func (c *PostgresConnector) createSlotAndPublication( if !s.SlotExists { conn, err := c.CreateReplConn(ctx) if err != nil { - return SlotCreationResult{}, fmt.Errorf("[slot] error acquiring connection: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[slot] error acquiring connection: %w", err) } c.logger.Warn(fmt.Sprintf("Creating replication slot '%s'", slot)) @@ -390,12 +390,12 @@ func (c *PostgresConnector) createSlotAndPublication( // THIS IS NOT IN A TX! if _, err := conn.Exec(ctx, "SET idle_in_transaction_session_timeout=0"); err != nil { conn.Close(ctx) - return SlotCreationResult{}, fmt.Errorf("[slot] error setting idle_in_transaction_session_timeout: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[slot] error setting idle_in_transaction_session_timeout: %w", err) } if _, err := conn.Exec(ctx, "SET lock_timeout=0"); err != nil { conn.Close(ctx) - return SlotCreationResult{}, fmt.Errorf("[slot] error setting lock_timeout: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[slot] error setting lock_timeout: %w", err) } opts := pglogrepl.CreateReplicationSlotOptions{ @@ -405,17 +405,17 @@ func (c *PostgresConnector) createSlotAndPublication( res, err := pglogrepl.CreateReplicationSlot(ctx, conn.PgConn(), slot, "pgoutput", opts) if err != nil { conn.Close(ctx) - return SlotCreationResult{}, fmt.Errorf("[slot] error creating replication slot: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[slot] error creating replication slot: %w", err) } pgversion, err := c.MajorVersion(ctx) if err != nil { conn.Close(ctx) - return SlotCreationResult{}, fmt.Errorf("[slot] error getting PG version: %w", err) + return model.SetupReplicationResult{}, fmt.Errorf("[slot] error getting PG version: %w", err) } c.logger.Info(fmt.Sprintf("Created replication slot '%s'", slot)) - return SlotCreationResult{ + return model.SetupReplicationResult{ SlotName: res.SlotName, SnapshotName: res.SnapshotName, SupportsTIDScans: pgversion >= shared.POSTGRES_13, @@ -426,11 +426,7 @@ func (c *PostgresConnector) createSlotAndPublication( if doInitialCopy { err = ErrSlotAlreadyExists } - return SlotCreationResult{ - SlotName: slot, - SnapshotName: "", - SupportsTIDScans: false, - }, err + return model.SetupReplicationResult{SlotName: slot}, err } } diff --git a/flow/connectors/postgres/postgres.go b/flow/connectors/postgres/postgres.go index 5a56005222..56a7021cd7 100644 --- a/flow/connectors/postgres/postgres.go +++ b/flow/connectors/postgres/postgres.go @@ -57,13 +57,6 @@ type ReplState struct { LastOffset atomic.Int64 } -type SlotCreationResult struct { - Conn *pgx.Conn - SlotName string - SnapshotName string - SupportsTIDScans bool -} - func NewPostgresConnector(ctx context.Context, env map[string]string, pgConfig *protos.PostgresConfig) (*PostgresConnector, error) { logger := shared.LoggerFromCtx(ctx) 
flowNameInApplicationName, err := peerdbenv.PeerDBApplicationNamePerMirrorName(ctx, nil) @@ -1099,9 +1092,9 @@ func (c *PostgresConnector) FinishExport(tx any) error { func (c *PostgresConnector) SetupReplication( ctx context.Context, req *protos.SetupReplicationInput, -) (SlotCreationResult, error) { +) (model.SetupReplicationResult, error) { if !shared.IsValidReplicationName(req.FlowJobName) { - return SlotCreationResult{}, fmt.Errorf("invalid flow job name: `%s`, it should be ^[a-z_][a-z0-9_]*$", req.FlowJobName) + return model.SetupReplicationResult{}, fmt.Errorf("invalid flow job name: `%s`, it should be ^[a-z_][a-z0-9_]*$", req.FlowJobName) } // Slotname would be the job name prefixed with "peerflow_slot_" @@ -1118,7 +1111,7 @@ func (c *PostgresConnector) SetupReplication( // Check if the replication slot and publication exist exists, err := c.checkSlotAndPublication(ctx, slotName, publicationName) if err != nil { - return SlotCreationResult{}, err + return model.SetupReplicationResult{}, err } tableNameMapping := make(map[string]model.NameAndExclude, len(req.TableNameMapping)) diff --git a/flow/model/model.go b/flow/model/model.go index f42b069ea8..f474a56aeb 100644 --- a/flow/model/model.go +++ b/flow/model/model.go @@ -1,6 +1,7 @@ package model import ( + "context" "crypto/sha256" "fmt" "sync/atomic" @@ -180,3 +181,10 @@ type SyncCompositeResponse struct { SyncResponse *SyncResponse NeedsNormalize bool } + +type SetupReplicationResult struct { + Conn interface{ Close(context.Context) error } + SlotName string + SnapshotName string + SupportsTIDScans bool +} From 8511f6c0f2ad5001e9cd3484d4cc8aeb0730c226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 02:37:47 +0000 Subject: [PATCH 72/80] fix --- flow/e2e/postgres/qrep_flow_pg_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/e2e/postgres/qrep_flow_pg_test.go b/flow/e2e/postgres/qrep_flow_pg_test.go index 32bd78b116..a9df50fbbd 100644 --- a/flow/e2e/postgres/qrep_flow_pg_test.go +++ b/flow/e2e/postgres/qrep_flow_pg_test.go @@ -163,9 +163,9 @@ func (s PeerFlowE2ETestSuitePG) TestSimpleSlotCreation() { require.NoError(s.t, err) s.t.Logf("slot creation complete: %v", slotInfo) - require.NoError(s.t, slotInfo.Conn.Close(context.Background())) - - s.t.Logf("successfully setup replication: %s", flowJobName) + if slotInfo.Conn != nil { + require.NoError(s.t, slotInfo.Conn.Close(context.Background())) + } } func (s PeerFlowE2ETestSuitePG) Test_Complete_QRep_Flow_Multi_Insert_PG() { From e884f6b54d8613a37ca0887bc5a8a0fb3ba1662c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 02:51:53 +0000 Subject: [PATCH 73/80] need to init Conn --- flow/connectors/postgres/client.go | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/connectors/postgres/client.go b/flow/connectors/postgres/client.go index 21b9e43a5b..9da38f9c7d 100644 --- a/flow/connectors/postgres/client.go +++ b/flow/connectors/postgres/client.go @@ -416,6 +416,7 @@ func (c *PostgresConnector) createSlotAndPublication( c.logger.Info(fmt.Sprintf("Created replication slot '%s'", slot)) return model.SetupReplicationResult{ + Conn: conn, SlotName: res.SlotName, SnapshotName: res.SnapshotName, SupportsTIDScans: pgversion >= shared.POSTGRES_13, From 8e7c85b5a0aaaec338fef877388c0c621344c83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 14:31:58 +0000 Subject: [PATCH 74/80] fix generic, remove GeneratePostgresFlowConnectionConfigs --- 
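
Note on this patch: it folds the Postgres-only GeneratePostgresFlowConnectionConfigs helper into the suite-aware GenerateFlowConnectionConfigs(s), so each e2e suite resolves its source peer through the suite itself rather than assuming Postgres. The sketch below is only an illustration of the suite shape the new generator relies on; the interface definitions shown are assumptions inferred from the calls visible in this diff (s.T(), s.Source(), GeneratePeer), not the repository's actual declarations.

    package e2e

    import (
    	"testing"

    	"github.com/PeerDB-io/peerdb/flow/generated/protos"
    )

    // Assumed shape of the source abstraction used by the generator:
    // anything that can produce a peer definition for the test run.
    type SuiteSource interface {
    	GeneratePeer(t *testing.T) *protos.Peer
    }

    // Assumed shape of the test suite passed to GenerateFlowConnectionConfigs:
    // it exposes the testing.T and the suite's configured source.
    type Suite interface {
    	T() *testing.T
    	Source() SuiteSource
    }

    // Callers now pass the whole suite instead of a *testing.T plus a source:
    //
    //	flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s)
    //
    // so one generator serves every source type the suite can wrap.

With that shape, a non-Postgres suite (such as the MySQL source this series introduces) can presumably reuse the same generator simply by returning its own SuiteSource from Source().
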
flow/e2e/bigquery/peer_flow_bq_test.go | 36 ++++++++++----------- flow/e2e/clickhouse/peer_flow_ch_test.go | 24 +++++++------- flow/e2e/congen.go | 36 ++------------------- flow/e2e/elasticsearch/peer_flow_es_test.go | 4 +-- flow/e2e/eventhub/peer_flow_eh_test.go | 2 +- flow/e2e/generic/generic_test.go | 7 ++-- flow/e2e/kafka/kafka_test.go | 8 ++--- flow/e2e/postgres/peer_flow_pg_test.go | 24 +++++++------- flow/e2e/pubsub/pubsub_test.go | 6 ++-- flow/e2e/s3/cdc_s3_test.go | 2 +- flow/e2e/snowflake/peer_flow_sf_test.go | 26 +++++++-------- 11 files changed, 72 insertions(+), 103 deletions(-) diff --git a/flow/e2e/bigquery/peer_flow_bq_test.go b/flow/e2e/bigquery/peer_flow_bq_test.go index 6ffc70fa68..c77038867a 100644 --- a/flow/e2e/bigquery/peer_flow_bq_test.go +++ b/flow/e2e/bigquery/peer_flow_bq_test.go @@ -119,7 +119,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Complete_Flow_No_Data() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -150,7 +150,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Char_ColType_Error() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -182,7 +182,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -233,7 +233,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_1_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -289,7 +289,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_2_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -339,7 +339,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Toast_Advance_3_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -395,7 +395,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Types_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -477,7 +477,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_NaN_Doubles_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -529,7 
+529,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Invalid_Geo_BQ_Avro_CDC() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -605,7 +605,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Multi_Table_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -659,7 +659,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Simple_Schema_Changes_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -737,7 +737,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_All_Types_Schema_Changes_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -805,7 +805,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -861,7 +861,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_Toast_1_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -921,7 +921,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Composite_PKey_Toast_2_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -972,7 +972,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Columns_BQ() { SoftDelete: true, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -1022,7 +1022,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_Multi_Table_Multi_Dataset_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -1362,7 +1362,7 @@ func (s PeerFlowE2ETestSuiteBQ) Test_JSON_PKey_BQ() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go index 
7d55a5d408..650f4f1150 100644 --- a/flow/e2e/clickhouse/peer_flow_ch_test.go +++ b/flow/e2e/clickhouse/peer_flow_ch_test.go @@ -75,7 +75,7 @@ func (s ClickHouseSuite) Test_Addition_Removal() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 1 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -211,7 +211,7 @@ func (s ClickHouseSuite) Test_NullableMirrorSetting() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_NULLABLE": "true"} @@ -258,7 +258,7 @@ func (s ClickHouseSuite) Test_NullableColumnSetting() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true for _, tm := range flowConnConfig.TableMappings { tm.Columns = []*protos.ColumnSetting{ @@ -309,7 +309,7 @@ func (s ClickHouseSuite) Test_Date32() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) @@ -347,7 +347,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Disabled() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_ENABLE_PRIMARY_UPDATE": "false"} @@ -388,7 +388,7 @@ func (s ClickHouseSuite) Test_Update_PKey_Env_Enabled() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_ENABLE_PRIMARY_UPDATE": "true"} @@ -425,7 +425,7 @@ func (s ClickHouseSuite) Test_Replident_Full_Unchanged_TOAST_Updates() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -470,7 +470,7 @@ func (s ClickHouseSuite) WeirdTable(tableName string) { TableNameMapping: map[string]string{s.attachSchemaSuffix(tableName): dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ 
-554,7 +554,7 @@ func (s ClickHouseSuite) Test_Large_Numeric() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) @@ -610,7 +610,7 @@ func (s ClickHouseSuite) testNumericFF(ffValue bool) { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_UNBOUNDED_NUMERIC_AS_STRING": strconv.FormatBool(ffValue)} tc := e2e.NewTemporalClient(s.t) @@ -674,7 +674,7 @@ func (s ClickHouseSuite) testBinaryFormat(format string, expected string) { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.Env = map[string]string{"PEERDB_CLICKHOUSE_BINARY_FORMAT": format} tc := e2e.NewTemporalClient(s.t) @@ -756,7 +756,7 @@ func (s ClickHouseSuite) Test_Types_CH() { TableNameMapping: map[string]string{srcFullName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s.t, s.source) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) diff --git a/flow/e2e/congen.go b/flow/e2e/congen.go index efa5a349a8..35e5d6fb6a 100644 --- a/flow/e2e/congen.go +++ b/flow/e2e/congen.go @@ -55,10 +55,8 @@ type FlowConnectionGenerationConfig struct { SoftDelete bool } -func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs( - t *testing.T, - source SuiteSource, -) *protos.FlowConnectionConfigs { +func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs(s Suite) *protos.FlowConnectionConfigs { + t := s.T() t.Helper() tblMappings := c.TableMappings if tblMappings == nil { @@ -73,35 +71,7 @@ func (c *FlowConnectionGenerationConfig) GenerateFlowConnectionConfigs( ret := &protos.FlowConnectionConfigs{ FlowJobName: c.FlowJobName, TableMappings: tblMappings, - SourceName: source.GeneratePeer(t).Name, - DestinationName: c.Destination, - SyncedAtColName: "_PEERDB_SYNCED_AT", - IdleTimeoutSeconds: 15, - } - if c.SoftDelete { - ret.SoftDeleteColName = "_PEERDB_IS_DELETED" - } - return ret -} - -func (c *FlowConnectionGenerationConfig) GeneratePostgresFlowConnectionConfigs( - t *testing.T, -) *protos.FlowConnectionConfigs { - t.Helper() - tblMappings := c.TableMappings - if tblMappings == nil { - for k, v := range c.TableNameMapping { - tblMappings = append(tblMappings, &protos.TableMapping{ - SourceTableIdentifier: k, - DestinationTableIdentifier: v, - }) - } - } - - ret := &protos.FlowConnectionConfigs{ - FlowJobName: c.FlowJobName, - TableMappings: tblMappings, - SourceName: GeneratePostgresPeer(t).Name, + SourceName: s.Source().GeneratePeer(t).Name, DestinationName: c.Destination, SyncedAtColName: "_PEERDB_SYNCED_AT", IdleTimeoutSeconds: 15, diff --git a/flow/e2e/elasticsearch/peer_flow_es_test.go b/flow/e2e/elasticsearch/peer_flow_es_test.go index 07aff0d918..e3def2c50b 100644 --- 
a/flow/e2e/elasticsearch/peer_flow_es_test.go +++ b/flow/e2e/elasticsearch/peer_flow_es_test.go @@ -30,7 +30,7 @@ func (s elasticsearchSuite) Test_Simple_PKey_CDC_Mirror() { TableNameMapping: map[string]string{srcTableName: srcTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.DoInitialSnapshot = true @@ -99,7 +99,7 @@ func (s elasticsearchSuite) Test_Composite_PKey_CDC_Mirror() { TableNameMapping: map[string]string{srcTableName: srcTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.DoInitialSnapshot = true diff --git a/flow/e2e/eventhub/peer_flow_eh_test.go b/flow/e2e/eventhub/peer_flow_eh_test.go index 67a2c09a6c..6a3489d1f1 100644 --- a/flow/e2e/eventhub/peer_flow_eh_test.go +++ b/flow/e2e/eventhub/peer_flow_eh_test.go @@ -154,7 +154,7 @@ func (s EventhubsSuite) Test_EH_Simple() { TableNameMapping: map[string]string{srcTableName: scopedEventhubName}, Destination: destinationPeer.Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_eh_simple_script" tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/generic/generic_test.go b/flow/e2e/generic/generic_test.go index b35cae86ef..097e720cf1 100644 --- a/flow/e2e/generic/generic_test.go +++ b/flow/e2e/generic/generic_test.go @@ -79,7 +79,7 @@ func (s Generic) Test_Simple_Flow() { TableMappings: e2e.TableMappings(s, srcTable, dstTable), Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) tc := e2e.NewTemporalClient(t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -126,8 +126,7 @@ func (s Generic) Test_Simple_Schema_Changes() { TableMappings: e2e.TableMappings(s, srcTable, dstTable), Destination: s.Peer().Name, } - - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) // wait for PeerFlowStatusQuery to finish setup // and then insert and mutate schema repeatedly. 
@@ -341,7 +340,7 @@ func (s Generic) Test_Partitioned_Table() { TableMappings: e2e.TableMappings(s, srcTable, dstTable), Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) tc := e2e.NewTemporalClient(t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/kafka/kafka_test.go b/flow/e2e/kafka/kafka_test.go index 041d62fdad..7b7f705b6b 100644 --- a/flow/e2e/kafka/kafka_test.go +++ b/flow/e2e/kafka/kafka_test.go @@ -107,7 +107,7 @@ func (s KafkaSuite) TestSimple() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_kasimple" tc := e2e.NewTemporalClient(s.t) @@ -166,7 +166,7 @@ func (s KafkaSuite) TestMessage() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_kamessage" tc := e2e.NewTemporalClient(s.t) @@ -221,7 +221,7 @@ func (s KafkaSuite) TestDefault() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) tc := e2e.NewTemporalClient(s.t) env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -276,7 +276,7 @@ func (s KafkaSuite) TestInitialLoad() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true _, err = s.Conn().Exec(context.Background(), fmt.Sprintf(` diff --git a/flow/e2e/postgres/peer_flow_pg_test.go b/flow/e2e/postgres/peer_flow_pg_test.go index 85ecae24a3..ce306fb13f 100644 --- a/flow/e2e/postgres/peer_flow_pg_test.go +++ b/flow/e2e/postgres/peer_flow_pg_test.go @@ -67,7 +67,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Geospatial_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 tc := e2e.NewTemporalClient(s.t) @@ -112,7 +112,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Types_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -179,7 +179,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Enums_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -221,7 +221,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := 
connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -283,7 +283,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_Toast_1_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -348,7 +348,7 @@ func (s PeerFlowE2ETestSuitePG) Test_Composite_PKey_Toast_2_PG() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -406,7 +406,7 @@ func (s PeerFlowE2ETestSuitePG) Test_PeerDB_Columns() { SoftDelete: true, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) @@ -825,7 +825,7 @@ func (s PeerFlowE2ETestSuitePG) Test_ContinueAsNew() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 2 flowConnConfig.IdleTimeoutSeconds = 10 @@ -972,7 +972,7 @@ func (s PeerFlowE2ETestSuitePG) Test_CustomSync() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) _, err := s.Conn().Exec(context.Background(), fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( @@ -1040,7 +1040,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TypeSystem_PG() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true flowConnConfig.System = protos.TypeSystem_PG flowConnConfig.SoftDeleteColName = "" @@ -1089,7 +1089,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TransformRecordScript() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "cdc_transform_record" tc := e2e.NewTemporalClient(s.t) @@ -1138,7 +1138,7 @@ func (s PeerFlowE2ETestSuitePG) Test_TransformRowScript() { TableNameMapping: map[string]string{srcTableName: dstTableName}, Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "cdc_transform_row" tc := e2e.NewTemporalClient(s.t) diff --git a/flow/e2e/pubsub/pubsub_test.go b/flow/e2e/pubsub/pubsub_test.go index 258b0ec296..2c320c63bc 100644 --- a/flow/e2e/pubsub/pubsub_test.go +++ b/flow/e2e/pubsub/pubsub_test.go @@ -145,7 +145,7 @@ func (s PubSubSuite) TestCreateTopic() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + 
flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_pscreate" tc := e2e.NewTemporalClient(s.t) @@ -197,7 +197,7 @@ func (s PubSubSuite) TestSimple() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_pssimple" psclient, err := sa.CreatePubSubClient(context.Background()) @@ -267,7 +267,7 @@ func (s PubSubSuite) TestInitialLoad() { TableNameMapping: map[string]string{srcTableName: flowName}, Destination: s.Peer(sa).Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.Script = "e2e_psinitial" flowConnConfig.DoInitialSnapshot = true diff --git a/flow/e2e/s3/cdc_s3_test.go b/flow/e2e/s3/cdc_s3_test.go index 48fcfbf40e..b3945c7f19 100644 --- a/flow/e2e/s3/cdc_s3_test.go +++ b/flow/e2e/s3/cdc_s3_test.go @@ -39,7 +39,7 @@ func (s PeerFlowE2ETestSuiteS3) Test_Complete_Simple_Flow_S3() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 5 env := e2e.ExecutePeerflow(tc, peerflow.CDCFlowWorkflow, flowConnConfig, nil) diff --git a/flow/e2e/snowflake/peer_flow_sf_test.go b/flow/e2e/snowflake/peer_flow_sf_test.go index a74117bbad..33a79899ed 100644 --- a/flow/e2e/snowflake/peer_flow_sf_test.go +++ b/flow/e2e/snowflake/peer_flow_sf_test.go @@ -58,7 +58,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Flow_ReplicaIdentity_Index_No_Pkey() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -111,7 +111,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Invalid_Numeric() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.DoInitialSnapshot = true tc := e2e.NewTemporalClient(s.t) @@ -165,7 +165,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Invalid_Geo_SF_Avro_CDC() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) // wait for PeerFlowStatusQuery to finish setup // and then insert 10 rows into the source table @@ -255,7 +255,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -306,7 +306,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_1_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -362,7 +362,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_2_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + 
flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -413,7 +413,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Toast_Advance_3_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -471,7 +471,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Types_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -566,7 +566,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Multi_Table_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -621,7 +621,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -676,7 +676,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_Toast_1_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 flowConnConfig.SoftDeleteColName = "" flowConnConfig.SyncedAtColName = "" @@ -738,7 +738,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Composite_PKey_Toast_2_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup @@ -1145,7 +1145,7 @@ func (s PeerFlowE2ETestSuiteSF) Test_Supported_Mixed_Case_Table_SF() { Destination: s.Peer().Name, } - flowConnConfig := connectionGen.GeneratePostgresFlowConnectionConfigs(s.t) + flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s) flowConnConfig.MaxBatchSize = 100 // wait for PeerFlowStatusQuery to finish setup From 7e0c7dd009908838df350efea9709f69fe0dbcba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 19:01:19 +0000 Subject: [PATCH 75/80] mysql logo --- ui/components/PeerComponent.tsx | 5 ++++- ui/components/PeerTypeComponent.tsx | 2 ++ ui/public/svgs/mysql.svg | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 ui/public/svgs/mysql.svg diff --git a/ui/components/PeerComponent.tsx b/ui/components/PeerComponent.tsx index aeef4926cf..26ebbdf8b3 100644 --- a/ui/components/PeerComponent.tsx +++ b/ui/components/PeerComponent.tsx @@ -25,6 +25,9 @@ export const DBTypeToImageMapping = (peerType: DBType | string) => { case DBType.POSTGRES: case 'POSTGRES': return '/svgs/pg.svg'; + case DBType.MYSQL: + case 'MYSQL': + return '/svgs/mysql.svg'; case DBType.SNOWFLAKE: case 'SNOWFLAKE': return '/svgs/sf.svg'; @@ -34,8 +37,8 @@ export const DBTypeToImageMapping = (peerType: DBType | string) => { case DBType.S3: case 'S3': return '/svgs/aws.svg'; - case 
'CLICKHOUSE': case DBType.CLICKHOUSE: + case 'CLICKHOUSE': return '/svgs/ch.svg'; case DBType.EVENTHUBS: return '/svgs/ms.svg'; diff --git a/ui/components/PeerTypeComponent.tsx b/ui/components/PeerTypeComponent.tsx index 26f9e5f7f2..fe155eeadf 100644 --- a/ui/components/PeerTypeComponent.tsx +++ b/ui/components/PeerTypeComponent.tsx @@ -8,6 +8,8 @@ export const DBTypeToGoodText = (ptype?: DBType) => { switch (dBTypeFromJSON(ptype)) { case DBType.POSTGRES: return 'PostgreSQL'; + case DBType.MYSQL: + return 'MySQL'; case DBType.SNOWFLAKE: return 'Snowflake'; case DBType.EVENTHUBS: diff --git a/ui/public/svgs/mysql.svg b/ui/public/svgs/mysql.svg new file mode 100644 index 0000000000..be18116ce9 --- /dev/null +++ b/ui/public/svgs/mysql.svg @@ -0,0 +1 @@ + \ No newline at end of file From ebf67f1fd96cf491c4c1548165006c227887e19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 22:29:49 +0000 Subject: [PATCH 76/80] no need for param --- flow/connectors/external_metadata/store.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 77a02792b8..387389be89 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -133,13 +133,13 @@ func (p *PostgresMetadata) SetLastOffset(ctx context.Context, jobName string, of p.logger.Debug("updating last offset", slog.String("offsetID", pglogrepl.LSN(offset.ID).String()), slog.String("offsetText", offset.Text)) if _, err := p.pool.Exec(ctx, ` INSERT INTO `+lastSyncStateTableName+` (job_name, last_offset, last_text, sync_batch_id) - VALUES ($1, $2, $3, $4) + VALUES ($1, $2, $3, 0) ON CONFLICT (job_name) DO UPDATE SET last_offset = GREATEST(`+lastSyncStateTableName+`.last_offset, excluded.last_offset), last_text = excluded.last_text, updated_at = NOW() - `, jobName, offset.ID, offset.Text, 0); err != nil { + `, jobName, offset.ID, offset.Text); err != nil { p.logger.Error("failed to update last offset", "error", err) return err } From 22e1106d512c6084024bd3e6940b0b748de907c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 22:44:48 +0000 Subject: [PATCH 77/80] mysql does not defer GetLastOffset to destination --- flow/activities/flowable_core.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/flow/activities/flowable_core.go b/flow/activities/flowable_core.go index ef4df798e8..976d6d491b 100644 --- a/flow/activities/flowable_core.go +++ b/flow/activities/flowable_core.go @@ -21,6 +21,7 @@ import ( "google.golang.org/protobuf/proto" "github.com/PeerDB-io/peerdb/flow/connectors" + "github.com/PeerDB-io/peerdb/flow/connectors/mysql" connpostgres "github.com/PeerDB-io/peerdb/flow/connectors/postgres" "github.com/PeerDB-io/peerdb/flow/connectors/utils/monitoring" "github.com/PeerDB-io/peerdb/flow/generated/protos" @@ -134,13 +135,17 @@ func syncCore[TPull connectors.CDCPullConnectorCore, TSync connectors.CDCSyncCon } lastOffset, err := func() (model.CdcCheckpoint, error) { - dstConn, err := connectors.GetByNameAs[TSync](ctx, config.Env, a.CatalogPool, config.DestinationName) - if err != nil { - return model.CdcCheckpoint{}, fmt.Errorf("failed to get destination connector: %w", err) - } - defer connectors.CloseConnector(ctx, dstConn) + if myConn, isMy := any(srcConn).(*connmysql.MySqlConnector); isMy { + return myConn.GetLastOffset(ctx, config.FlowJobName) + } else { + dstConn, err := 
connectors.GetByNameAs[TSync](ctx, config.Env, a.CatalogPool, config.DestinationName) + if err != nil { + return model.CdcCheckpoint{}, fmt.Errorf("failed to get destination connector: %w", err) + } + defer connectors.CloseConnector(ctx, dstConn) - return dstConn.GetLastOffset(ctx, config.FlowJobName) + return dstConn.GetLastOffset(ctx, config.FlowJobName) + } }() if err != nil { a.Alerter.LogFlowError(ctx, flowName, err) From 3ab0360053a231bfe601b41113383d80be6397c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Fri, 17 Jan 2025 23:24:41 +0000 Subject: [PATCH 78/80] track position more accurately, avoid clearing it --- flow/connectors/mysql/cdc.go | 38 ++++-- .../connectors/utils/monitoring/monitoring.go | 123 +++++++++--------- 2 files changed, 84 insertions(+), 77 deletions(-) diff --git a/flow/connectors/mysql/cdc.go b/flow/connectors/mysql/cdc.go index 7e5652d4a2..2371069767 100644 --- a/flow/connectors/mysql/cdc.go +++ b/flow/connectors/mysql/cdc.go @@ -163,47 +163,49 @@ func (c *MySqlConnector) startSyncer() *replication.BinlogSyncer { }) } -func (c *MySqlConnector) startStreaming(pos string) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { +func (c *MySqlConnector) startStreaming( + pos string, +) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, mysql.Position, error) { if rest, isFile := strings.CutPrefix(pos, "!f:"); isFile { comma := strings.LastIndexByte(rest, ',') if comma == -1 { - return nil, nil, nil, fmt.Errorf("no comma in file/pos offset %s", pos) + return nil, nil, nil, mysql.Position{}, fmt.Errorf("no comma in file/pos offset %s", pos) } offset, err := strconv.ParseUint(rest[comma+1:], 16, 32) if err != nil { - return nil, nil, nil, fmt.Errorf("invalid offset in filepos offset %s: %w", pos, err) + return nil, nil, nil, mysql.Position{}, fmt.Errorf("invalid offset in file/pos offset %s: %w", pos, err) } - return c.startCdcStreamingFilePos(rest[:comma], uint32(offset)) + return c.startCdcStreamingFilePos(mysql.Position{Name: rest[:comma], Pos: uint32(offset)}) } else { gset, err := mysql.ParseGTIDSet(c.config.Flavor, pos) if err != nil { - return nil, nil, nil, err + return nil, nil, nil, mysql.Position{}, err } return c.startCdcStreamingGtid(gset) } } func (c *MySqlConnector) startCdcStreamingFilePos( - lastOffsetName string, lastOffsetPos uint32, -) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { + pos mysql.Position, +) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, mysql.Position, error) { syncer := c.startSyncer() - stream, err := syncer.StartSync(mysql.Position{Name: lastOffsetName, Pos: lastOffsetPos}) + stream, err := syncer.StartSync(pos) if err != nil { syncer.Close() } - return syncer, stream, nil, err + return syncer, stream, nil, pos, err } func (c *MySqlConnector) startCdcStreamingGtid( gset mysql.GTIDSet, -) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, error) { +) (*replication.BinlogSyncer, *replication.BinlogStreamer, mysql.GTIDSet, mysql.Position, error) { // https://hevodata.com/learn/mysql-gtids-and-replication-set-up syncer := c.startSyncer() stream, err := syncer.StartSyncGTID(gset) if err != nil { syncer.Close() } - return syncer, stream, gset, err + return syncer, stream, gset, mysql.Position{}, err } func (c *MySqlConnector) ReplPing(context.Context) error { @@ -249,12 +251,16 @@ func (c *MySqlConnector) PullRecords( ) error { defer req.RecordStream.Close() - syncer, mystream, gset, err 
:= c.startStreaming(req.LastOffset.Text) + syncer, mystream, gset, pos, err := c.startStreaming(req.LastOffset.Text) if err != nil { return err } defer syncer.Close() + if gset == nil { + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos)) + } + var fetchedBytesCounter metric.Int64Counter if otelManager != nil { var err error @@ -285,11 +291,17 @@ func (c *MySqlConnector) PullRecords( } // TODO if gset == nil update pos with event.Header.LogPos + if gset == nil && event.Header.LogPos > 0 { + pos.Pos = max(pos.Pos, event.Header.LogPos) + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos)) + } switch ev := event.Event.(type) { case *replication.RotateEvent: if gset == nil && event.Header.Timestamp != 0 { - req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%x", string(ev.NextLogName), ev.Position)) + pos.Name = string(ev.NextLogName) + pos.Pos = uint32(ev.Position) + req.RecordStream.UpdateLatestCheckpointText(fmt.Sprintf("!f:%s,%x", pos.Name, pos.Pos)) } case *replication.MariadbGTIDEvent: if gset != nil { diff --git a/flow/connectors/utils/monitoring/monitoring.go b/flow/connectors/utils/monitoring/monitoring.go index c33bdc6f0f..372b99765b 100644 --- a/flow/connectors/utils/monitoring/monitoring.go +++ b/flow/connectors/utils/monitoring/monitoring.go @@ -24,10 +24,10 @@ type CDCBatchInfo struct { } func InitializeCDCFlow(ctx context.Context, pool *pgxpool.Pool, flowJobName string) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, `INSERT INTO peerdb_stats.cdc_flows(flow_name,latest_lsn_at_source,latest_lsn_at_target) VALUES($1,0,0) - ON CONFLICT DO NOTHING`, flowJobName) - if err != nil { + ON CONFLICT DO NOTHING`, flowJobName, + ); err != nil { return fmt.Errorf("error while inserting flow into cdc_flows: %w", err) } return nil @@ -36,10 +36,10 @@ func InitializeCDCFlow(ctx context.Context, pool *pgxpool.Pool, flowJobName stri func UpdateLatestLSNAtSourceForCDCFlow(ctx context.Context, pool *pgxpool.Pool, flowJobName string, latestLSNAtSource int64, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "UPDATE peerdb_stats.cdc_flows SET latest_lsn_at_source=$1 WHERE flow_name=$2", - uint64(latestLSNAtSource), flowJobName) - if err != nil { + uint64(latestLSNAtSource), flowJobName, + ); err != nil { return fmt.Errorf("[source] error while updating flow in cdc_flows: %w", err) } return nil @@ -48,10 +48,10 @@ func UpdateLatestLSNAtSourceForCDCFlow(ctx context.Context, pool *pgxpool.Pool, func UpdateLatestLSNAtTargetForCDCFlow(ctx context.Context, pool *pgxpool.Pool, flowJobName string, latestLSNAtTarget int64, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "UPDATE peerdb_stats.cdc_flows SET latest_lsn_at_target=$1 WHERE flow_name=$2", - uint64(latestLSNAtTarget), flowJobName) - if err != nil { + uint64(latestLSNAtTarget), flowJobName, + ); err != nil { return fmt.Errorf("[target] error while updating flow in cdc_flows: %w", err) } return nil @@ -60,12 +60,12 @@ func UpdateLatestLSNAtTargetForCDCFlow(ctx context.Context, pool *pgxpool.Pool, func AddCDCBatchForFlow(ctx context.Context, pool *pgxpool.Pool, flowJobName string, batchInfo CDCBatchInfo, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, `INSERT INTO peerdb_stats.cdc_batches(flow_name,batch_id,rows_in_batch,batch_start_lsn,batch_end_lsn, start_time) VALUES($1,$2,$3,$4,$5,$6) ON CONFLICT DO NOTHING`, flowJobName, batchInfo.BatchID, batchInfo.RowsInBatch, 0, - 
uint64(batchInfo.BatchEndlSN), batchInfo.StartTime) - if err != nil { + uint64(batchInfo.BatchEndlSN), batchInfo.StartTime, + ); err != nil { return fmt.Errorf("error while inserting batch into cdc_batch: %w", err) } return nil @@ -80,10 +80,10 @@ func UpdateNumRowsAndEndLSNForCDCBatch( numRows uint32, batchEndCheckpoint model.CdcCheckpoint, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "UPDATE peerdb_stats.cdc_batches SET rows_in_batch=$1,batch_end_lsn=$2,batch_end_lsn_text=$3 WHERE flow_name=$4 AND batch_id=$5", - numRows, uint64(batchEndCheckpoint.ID), batchEndCheckpoint.Text, flowJobName, batchID) - if err != nil { + numRows, uint64(batchEndCheckpoint.ID), batchEndCheckpoint.Text, flowJobName, batchID, + ); err != nil { return fmt.Errorf("error while updating batch in cdc_batch: %w", err) } return nil @@ -95,12 +95,12 @@ func UpdateEndTimeForCDCBatch( flowJobName string, batchID int64, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, `UPDATE peerdb_stats.cdc_batches SET end_time = NOW() WHERE flow_name = $1 AND batch_id <= $2 AND end_time IS NULL`, - flowJobName, batchID) - if err != nil { + flowJobName, batchID, + ); err != nil { return fmt.Errorf("error while updating batch in cdc_batch: %w", err) } return nil @@ -114,8 +114,7 @@ func AddCDCBatchTablesForFlow(ctx context.Context, pool *pgxpool.Pool, flowJobNa return fmt.Errorf("error while beginning transaction for inserting statistics into cdc_batch_table: %w", err) } defer func() { - err = insertBatchTablesTx.Rollback(context.Background()) - if err != pgx.ErrTxClosed && err != nil { + if err := insertBatchTablesTx.Rollback(context.Background()); err != pgx.ErrTxClosed && err != nil { shared.LoggerFromCtx(ctx).Error("error during transaction rollback", slog.Any("error", err), slog.String(string(shared.FlowNameKey), flowJobName)) @@ -126,21 +125,19 @@ func AddCDCBatchTablesForFlow(ctx context.Context, pool *pgxpool.Pool, flowJobNa inserts := rowCounts.InsertCount.Load() updates := rowCounts.UpdateCount.Load() deletes := rowCounts.DeleteCount.Load() - _, err = insertBatchTablesTx.Exec(ctx, + if _, err := insertBatchTablesTx.Exec(ctx, `INSERT INTO peerdb_stats.cdc_batch_table (flow_name,batch_id,destination_table_name,num_rows, insert_count,update_count,delete_count) VALUES($1,$2,$3,$4,$5,$6,$7) ON CONFLICT DO NOTHING`, flowJobName, batchID, destinationTableName, - inserts+updates+deletes, inserts, updates, deletes) - if err != nil { + inserts+updates+deletes, inserts, updates, deletes, + ); err != nil { return fmt.Errorf("error while inserting statistics into cdc_batch_table: %w", err) } } - err = insertBatchTablesTx.Commit(ctx) - if err != nil { - return fmt.Errorf("error while committing transaction for inserting statistics into cdc_batch_table: %w", - err) + if err := insertBatchTablesTx.Commit(ctx); err != nil { + return fmt.Errorf("error while committing transaction for inserting statistics into cdc_batch_table: %w", err) } return nil } @@ -154,11 +151,11 @@ func InitializeQRepRun( parentMirrorName string, ) error { flowJobName := config.GetFlowJobName() - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "INSERT INTO peerdb_stats.qrep_runs(flow_name,run_uuid,source_table,destination_table,parent_mirror_name)"+ " VALUES($1,$2,$3,$4,$5) ON CONFLICT DO NOTHING", - flowJobName, runUUID, config.WatermarkTable, config.DestinationTableIdentifier, parentMirrorName) - if err != nil { + flowJobName, runUUID, config.WatermarkTable, config.DestinationTableIdentifier, parentMirrorName, + ); err != 
nil { return fmt.Errorf("error while inserting qrep run in qrep_runs: %w", err) } @@ -172,10 +169,10 @@ func InitializeQRepRun( } func UpdateStartTimeForQRepRun(ctx context.Context, pool *pgxpool.Pool, runUUID string) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "UPDATE peerdb_stats.qrep_runs SET start_time=$1, fetch_complete=true WHERE run_uuid=$2", - time.Now(), runUUID) - if err != nil { + time.Now(), runUUID, + ); err != nil { return fmt.Errorf("error while updating start time for run_uuid %s in qrep_runs: %w", runUUID, err) } @@ -183,10 +180,10 @@ func UpdateStartTimeForQRepRun(ctx context.Context, pool *pgxpool.Pool, runUUID } func UpdateEndTimeForQRepRun(ctx context.Context, pool *pgxpool.Pool, runUUID string) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "UPDATE peerdb_stats.qrep_runs SET end_time=$1, consolidate_complete=true WHERE run_uuid=$2", - time.Now(), runUUID) - if err != nil { + time.Now(), runUUID, + ); err != nil { return fmt.Errorf("error while updating end time for run_uuid %s in qrep_runs: %w", runUUID, err) } @@ -199,7 +196,7 @@ func AppendSlotSizeInfo( peerName string, slotInfo *protos.SlotInfo, ) error { - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, "INSERT INTO peerdb_stats.peer_slot_size"+ "(peer_name, slot_name, restart_lsn, redo_lsn, confirmed_flush_lsn, slot_size, wal_status) "+ "VALUES($1,$2,$3,$4,$5,$6,$7) ON CONFLICT DO NOTHING;", @@ -210,8 +207,7 @@ func AppendSlotSizeInfo( slotInfo.ConfirmedFlushLSN, slotInfo.LagInMb, slotInfo.WalStatus, - ) - if err != nil { + ); err != nil { return fmt.Errorf("error while upserting row for slot_size: %w", err) } @@ -260,13 +256,13 @@ func addPartitionToQRepRun(ctx context.Context, pool *pgxpool.Pool, flowJobName return fmt.Errorf("unknown range type: %v", x) } - _, err := pool.Exec(ctx, + if _, err := pool.Exec(ctx, `INSERT INTO peerdb_stats.qrep_partitions (flow_name,run_uuid,partition_uuid,partition_start,partition_end,restart_count,parent_mirror_name) VALUES($1,$2,$3,$4,$5,$6,$7) ON CONFLICT(run_uuid,partition_uuid) DO UPDATE SET restart_count=qrep_partitions.restart_count+1`, - flowJobName, runUUID, partition.PartitionId, rangeStart, rangeEnd, 0, parentMirrorName) - if err != nil { + flowJobName, runUUID, partition.PartitionId, rangeStart, rangeEnd, 0, parentMirrorName, + ); err != nil { return fmt.Errorf("error while inserting qrep partition in qrep_partitions: %w", err) } @@ -280,9 +276,10 @@ func UpdateStartTimeForPartition( partition *protos.QRepPartition, startTime time.Time, ) error { - _, err := pool.Exec(ctx, `UPDATE peerdb_stats.qrep_partitions SET start_time=$1 - WHERE run_uuid=$2 AND partition_uuid=$3`, startTime, runUUID, partition.PartitionId) - if err != nil { + if _, err := pool.Exec(ctx, + `UPDATE peerdb_stats.qrep_partitions SET start_time=$1 WHERE run_uuid=$2 AND partition_uuid=$3`, + startTime, runUUID, partition.PartitionId, + ); err != nil { return fmt.Errorf("error while updating qrep partition in qrep_partitions: %w", err) } return nil @@ -291,9 +288,10 @@ func UpdateStartTimeForPartition( func UpdatePullEndTimeAndRowsForPartition(ctx context.Context, pool *pgxpool.Pool, runUUID string, partition *protos.QRepPartition, rowsInPartition int64, ) error { - _, err := pool.Exec(ctx, `UPDATE peerdb_stats.qrep_partitions SET pull_end_time=$1,rows_in_partition=$2 - WHERE run_uuid=$3 AND partition_uuid=$4`, time.Now(), rowsInPartition, runUUID, partition.PartitionId) - if err != nil { + if _, err := pool.Exec(ctx, + `UPDATE peerdb_stats.qrep_partitions 
SET pull_end_time=$1,rows_in_partition=$2 WHERE run_uuid=$3 AND partition_uuid=$4`, + time.Now(), rowsInPartition, runUUID, partition.PartitionId, + ); err != nil { return fmt.Errorf("error while updating qrep partition in qrep_partitions: %w", err) } return nil @@ -302,9 +300,10 @@ func UpdatePullEndTimeAndRowsForPartition(ctx context.Context, pool *pgxpool.Poo func UpdateEndTimeForPartition(ctx context.Context, pool *pgxpool.Pool, runUUID string, partition *protos.QRepPartition, ) error { - _, err := pool.Exec(ctx, `UPDATE peerdb_stats.qrep_partitions SET end_time=$1 - WHERE run_uuid=$2 AND partition_uuid=$3`, time.Now(), runUUID, partition.PartitionId) - if err != nil { + if _, err := pool.Exec(ctx, + `UPDATE peerdb_stats.qrep_partitions SET end_time=$1 WHERE run_uuid=$2 AND partition_uuid=$3`, + time.Now(), runUUID, partition.PartitionId, + ); err != nil { return fmt.Errorf("error while updating qrep partition in qrep_partitions: %w", err) } return nil @@ -313,37 +312,33 @@ func UpdateEndTimeForPartition(ctx context.Context, pool *pgxpool.Pool, runUUID func UpdateRowsSyncedForPartition(ctx context.Context, pool *pgxpool.Pool, rowsSynced int, runUUID string, partition *protos.QRepPartition, ) error { - _, err := pool.Exec(ctx, `UPDATE peerdb_stats.qrep_partitions SET rows_synced=$1 - WHERE run_uuid=$2 AND partition_uuid=$3`, rowsSynced, runUUID, partition.PartitionId) - if err != nil { + if _, err := pool.Exec(ctx, + `UPDATE peerdb_stats.qrep_partitions SET rows_synced=$1 WHERE run_uuid=$2 AND partition_uuid=$3`, + rowsSynced, runUUID, partition.PartitionId, + ); err != nil { return fmt.Errorf("error while updating rows_synced in qrep_partitions: %w", err) } return nil } func DeleteMirrorStats(ctx context.Context, pool *pgxpool.Pool, flowJobName string) error { - _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.qrep_partitions WHERE parent_mirror_name = $1`, flowJobName) - if err != nil { + if _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.qrep_partitions WHERE parent_mirror_name = $1`, flowJobName); err != nil { return fmt.Errorf("error while deleting qrep_partitions: %w", err) } - _, err = pool.Exec(ctx, `DELETE FROM peerdb_stats.qrep_runs WHERE parent_mirror_name = $1`, flowJobName) - if err != nil { + if _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.qrep_runs WHERE parent_mirror_name = $1`, flowJobName); err != nil { return fmt.Errorf("error while deleting qrep_runs: %w", err) } - _, err = pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_batches WHERE flow_name = $1`, flowJobName) - if err != nil { + if _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_batches WHERE flow_name = $1`, flowJobName); err != nil { return fmt.Errorf("error while deleting cdc_batches: %w", err) } - _, err = pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_batch_table WHERE flow_name = $1`, flowJobName) - if err != nil { + if _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_batch_table WHERE flow_name = $1`, flowJobName); err != nil { return fmt.Errorf("error while deleting cdc_batch_table: %w", err) } - _, err = pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_flows WHERE flow_name = $1`, flowJobName) - if err != nil { + if _, err := pool.Exec(ctx, `DELETE FROM peerdb_stats.cdc_flows WHERE flow_name = $1`, flowJobName); err != nil { return fmt.Errorf("error while deleting cdc_flows: %w", err) } From 9927ae0ba206d60ff40e85f58de8cad22836a4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Dub=C3=A9?= Date: Sat, 18 Jan 2025 00:00:08 +0000 Subject: [PATCH 79/80] avoid segfault --- 
 flow/activities/snapshot_activity.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/flow/activities/snapshot_activity.go b/flow/activities/snapshot_activity.go
index 3312eae63a..aecf31633d 100644
--- a/flow/activities/snapshot_activity.go
+++ b/flow/activities/snapshot_activity.go
@@ -110,9 +110,13 @@ func (a *SnapshotActivity) MaintainTx(ctx context.Context, sessionID string, pee
 	}
 
 	a.SnapshotStatesMutex.Lock()
-	a.TxSnapshotStates[sessionID] = TxSnapshotState{
-		SnapshotName:     exportSnapshotOutput.SnapshotName,
-		SupportsTIDScans: exportSnapshotOutput.SupportsTidScans,
+	if exportSnapshotOutput != nil {
+		a.TxSnapshotStates[sessionID] = TxSnapshotState{
+			SnapshotName:     exportSnapshotOutput.SnapshotName,
+			SupportsTIDScans: exportSnapshotOutput.SupportsTidScans,
+		}
+	} else {
+		a.TxSnapshotStates[sessionID] = TxSnapshotState{}
 	}
 	a.SnapshotStatesMutex.Unlock()
 
From 2d4c504dcd7a4a67d3dd13c7aae75947f1d14f8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philip=20Dub=C3=A9?=
Date: Sat, 18 Jan 2025 00:48:24 +0000
Subject: [PATCH 80/80] quote "key" column in e2e insert statements

---
 flow/e2e/clickhouse/peer_flow_ch_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flow/e2e/clickhouse/peer_flow_ch_test.go b/flow/e2e/clickhouse/peer_flow_ch_test.go
index 650f4f1150..5e636adee1 100644
--- a/flow/e2e/clickhouse/peer_flow_ch_test.go
+++ b/flow/e2e/clickhouse/peer_flow_ch_test.go
@@ -173,8 +173,8 @@ func (s ClickHouseSuite) Test_Addition_Removal() {
 	afterRemoveRunID := e2e.EnvGetRunID(s.t, env)
 	require.NotEqual(s.t, runID, afterRemoveRunID)
 
-	require.NoError(s.t, s.source.Exec(fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", srcTableName)))
-	require.NoError(s.t, s.source.Exec(fmt.Sprintf("INSERT INTO %s (key) VALUES ('test')", addedSrcTableName)))
+	require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s ("key") VALUES ('test')`, srcTableName)))
+	require.NoError(s.t, s.source.Exec(fmt.Sprintf(`INSERT INTO %s ("key") VALUES ('test')`, addedSrcTableName)))
 
 	e2e.EnvWaitForEqualTablesWithNames(env, s, "second insert to added table", "test_table_add_remove_added", addedDstTableName, "id,\"key\"")