Skip to content

Commit

Permalink
feat: prevent mixing preview rows on unmodified tables
Browse files Browse the repository at this point in the history
  • Loading branch information
Shivaprasad committed Jul 13, 2023
1 parent 7ab8cd0 commit 78e1aae
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 56 deletions.
40 changes: 17 additions & 23 deletions plugins/extractors/bigquery/bigquery.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,32 @@ package bigquery

import (
"context"
"crypto/rand"
_ "embed" // used to print the embedded assets
"encoding/base64"
"encoding/json"
"fmt"
"html/template"
"math/big"
"math/rand"
"strings"
"sync"

"cloud.google.com/go/bigquery"
datacatalog "cloud.google.com/go/datacatalog/apiv1"
"cloud.google.com/go/datacatalog/apiv1/datacatalogpb"
"github.com/goto/meteor/models"
v1beta2 "github.com/goto/meteor/models/gotocompany/assets/v1beta2"
"github.com/goto/meteor/plugins"
"github.com/goto/meteor/plugins/extractors/bigquery/auditlog"
"github.com/goto/meteor/registry"
"github.com/goto/meteor/utils"
"github.com/goto/salt/log"
"github.com/pkg/errors"
"google.golang.org/api/iterator"
"google.golang.org/api/option"
"google.golang.org/protobuf/types/known/anypb"
"google.golang.org/protobuf/types/known/structpb"
"google.golang.org/protobuf/types/known/timestamppb"

Check failure on line 24 in plugins/extractors/bigquery/bigquery.go

View workflow job for this annotation

GitHub Actions / golangci

File is not `gci`-ed with --skip-generated -s standard,default (gci)
"github.com/goto/meteor/models"
v1beta2 "github.com/goto/meteor/models/gotocompany/assets/v1beta2"
"github.com/goto/meteor/plugins"
"github.com/goto/meteor/plugins/extractors/bigquery/auditlog"
"github.com/goto/meteor/registry"
"github.com/goto/meteor/utils"
)

//go:embed README.md
Expand Down Expand Up @@ -109,7 +109,7 @@ type Extractor struct {
randFn randFn
}

type randFn func(int64) (int64, error)
type randFn func(max, rndSeed int64) int64

type NewClientFunc func(ctx context.Context, logger log.Logger, config *Config) (*bigquery.Client, error)

Expand Down Expand Up @@ -301,7 +301,7 @@ func (e *Extractor) buildAsset(ctx context.Context, t *bigquery.Table, md *bigqu
var previewRows *structpb.ListValue
if md.Type == bigquery.RegularTable {
var err error
previewFields, previewRows, err = e.buildPreview(ctx, t)
previewFields, previewRows, err = e.buildPreview(ctx, t, md)
if err != nil {
e.logger.Warn("error building preview", "err", err, "table", tableFQN)
}
Expand Down Expand Up @@ -390,7 +390,7 @@ func (e *Extractor) buildColumn(ctx context.Context, field *bigquery.FieldSchema
return col
}

func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table) (fields []string, rows *structpb.ListValue, err error) {
func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table, md *bigquery.TableMetadata) (fields []string, rows *structpb.ListValue, err error) {

Check failure on line 393 in plugins/extractors/bigquery/bigquery.go

View workflow job for this annotation

GitHub Actions / golangci

cognitive complexity 21 of func `(*Extractor).buildPreview` is high (> 20) (gocognit)
maxPreviewRows := e.config.MaxPreviewRows
if maxPreviewRows == 0 {
return nil, nil, nil
Expand Down Expand Up @@ -445,7 +445,7 @@ func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table) (fields

// if mix values true, then randomize the values
if e.config.MixValues {
tempRows, err = e.mixValues(tempRows)
tempRows, err = e.mixValues(tempRows, md.LastModifiedTime.Unix())
if err != nil {
return nil, nil, fmt.Errorf("mix values: %w", err)
}
Expand All @@ -459,7 +459,7 @@ func (e *Extractor) buildPreview(ctx context.Context, t *bigquery.Table) (fields
return fields, rows, nil
}

func (e *Extractor) mixValues(rows []interface{}) ([]interface{}, error) {
func (e *Extractor) mixValues(rows []interface{}, rndSeed int64) ([]interface{}, error) {
numRows := len(rows)
if numRows < 2 {
return rows, nil
Expand All @@ -473,10 +473,7 @@ func (e *Extractor) mixValues(rows []interface{}) ([]interface{}, error) {
for col := 0; col < numColumns; col++ {
for row := numRows - 1; row > 0; row-- {
// Generate a random index within the range [0, row+1)
swapRow, err := e.randFn(int64(row + 1))
if err != nil {
return nil, err
}
swapRow := e.randFn(int64(row+1), rndSeed)

// Swap the values in the current row and the randomly selected row
a, ok := mixedRows[row].([]interface{})
Expand Down Expand Up @@ -634,10 +631,7 @@ func init() {
}
}

func cryptoRandom(max int64) (int64, error) {
i, err := rand.Int(rand.Reader, big.NewInt(max))
if err != nil {
return 0, err
}
return i.Int64(), nil
func cryptoRandom(max, rndSeed int64) int64 {
rnd := rand.New(rand.NewSource(rndSeed))

Check failure on line 635 in plugins/extractors/bigquery/bigquery.go

View workflow job for this annotation

GitHub Actions / golangci

G404: Use of weak random number generator (math/rand instead of crypto/rand) (gosec)
return rnd.Int63n(max)
}
45 changes: 12 additions & 33 deletions plugins/extractors/bigquery/bigquery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,13 @@ package bigquery_test

import (
"context"
"errors"
"fmt"
"log"
"math/rand"
"os"
"testing"
"time"

bq "cloud.google.com/go/bigquery"
v1beta2 "github.com/goto/meteor/models/gotocompany/assets/v1beta2"
"github.com/goto/meteor/plugins"
"github.com/goto/meteor/plugins/extractors/bigquery"
"github.com/goto/meteor/test/mocks"
"github.com/goto/meteor/test/utils"
slog "github.com/goto/salt/log"
"github.com/nsf/jsondiff"
"github.com/ory/dockertest/v3"
Expand All @@ -27,6 +20,12 @@ import (
"google.golang.org/api/option"
"google.golang.org/protobuf/types/known/anypb"
"google.golang.org/protobuf/types/known/timestamppb"

v1beta2 "github.com/goto/meteor/models/gotocompany/assets/v1beta2"
"github.com/goto/meteor/plugins"
"github.com/goto/meteor/plugins/extractors/bigquery"
"github.com/goto/meteor/test/mocks"
"github.com/goto/meteor/test/utils"
)

const (
Expand Down Expand Up @@ -156,7 +155,7 @@ func TestInit(t *testing.T) {
}

func TestExtract(t *testing.T) {
runTest := func(t *testing.T, cfg plugins.Config, randomizer func(int64) (int64, error)) []*v1beta2.Asset {
runTest := func(t *testing.T, cfg plugins.Config, randomizer func(max, seed int64) int64) []*v1beta2.Asset {
extr := bigquery.New(utils.Logger, mockClient, randomizer)
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel()
Expand Down Expand Up @@ -204,46 +203,26 @@ func TestExtract(t *testing.T) {
},
}

randFn := func(seed int64) func(max int64) (int64, error) {
randomizer := rand.New(rand.NewSource(seed))

return func(max int64) (int64, error) {
return randomizer.Int63n(max), nil
}
randFn := func(max, seed int64) int64 {
return 10
}

t.Run("should return preview rows with mixed values", func(t *testing.T) {
seed := int64(42)
randomizer := randFn(seed)

actual := runTest(t, cfg, randomizer)
actual := runTest(t, cfg, randFn)

utils.AssertJSONFile(t, "testdata/expected-assets-mixed.json", actual, jsondiff.FullMatch)
})

t.Run("with different seed should not equal to expected", func(t *testing.T) {
seed := int64(99)
randomizer := randFn(seed)

actual := runTest(t, cfg, randomizer)
actual := runTest(t, cfg, randFn)
utils.AssertJSONFile(t, "testdata/expected-assets-mixed.json", actual, jsondiff.NoMatch)
})

t.Run("should not build preview rows when randomFn fails", func(t *testing.T) {
actual := runTest(t, cfg, func(max int64) (int64, error) {
return 0, errors.New("randomizer failed")
})
utils.AssertJSONFile(t, "testdata/expected-assets-no-preview.json", actual, jsondiff.FullMatch)
})

t.Run("should not randomize if rows < 2", func(t *testing.T) {
seed := int64(42)
randomizer := randFn(seed)

newCfg := cfg
newCfg.RawConfig["max_preview_rows"] = "1"

actual := runTest(t, newCfg, randomizer)
actual := runTest(t, newCfg, randFn)
utils.AssertJSONFile(t, "testdata/expected-assets.json", actual, jsondiff.FullMatch)
})
})
Expand Down

0 comments on commit 78e1aae

Please sign in to comment.