From 97fd9dad727901c6d370f7cbfd2b01166296c2df Mon Sep 17 00:00:00 2001 From: b5 Date: Thu, 16 May 2019 11:21:33 -0400 Subject: [PATCH 1/4] fix(ds): ds.set_body inherits structure instead of inventing one set_body wasn't doing the right thing when it comes to determining the destination datset structure for writing a body. We now inherit structure from existing dataset data, and only fall back to json if no structure exists. The general problem here is a separation of concerns between data in the starlark runtime and dataset serialization formats. Those should always be seperate. closes qri-io/qri#756 --- ds/dataset.go | 47 ++++++++++++++++++++++++++++++++----------- ds/dataset_test.go | 38 +++++++++++++++++++++++++++++++++- ds/testdata/test.star | 12 ++++++++++- 3 files changed, 83 insertions(+), 14 deletions(-) diff --git a/ds/dataset.go b/ds/dataset.go index 8fda315..581828b 100644 --- a/ds/dataset.go +++ b/ds/dataset.go @@ -295,6 +295,11 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, err } + if err := d.checkField("structure"); err != nil { + err = fmt.Errorf("cannot use a transform to set the body of a dataset and manually adjust structure at the same time") + return starlark.None, err + } + df := dataFormat.GoString() if df == "" { // default to json @@ -321,22 +326,14 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, fmt.Errorf("expected body to be iterable") } - sch := dataset.BaseSchemaArray - if data.Type() == "dict" { - sch = dataset.BaseSchemaObject - } - - st := &dataset.Structure{ - Format: df, - Schema: sch, - } + d.write.Structure = d.writeStructure(data) - w, err := dsio.NewEntryBuffer(st) + w, err := dsio.NewEntryBuffer(d.write.Structure) if err != nil { return starlark.None, err } - r := NewEntryReader(st, iter) + r := NewEntryReader(d.write.Structure, iter) if err := dsio.Copy(r, w); err != nil { return starlark.None, err } @@ -344,9 +341,35 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, err } - d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), w.Bytes())) + d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", d.write.Structure.Format), w.Bytes())) d.modBody = true d.bodyCache = nil return starlark.None, nil } + +// writeStructure determines the destination data structure for writing a +// dataset body, falling back to a default json structure based on input values +// if no prior structure exists +func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure { + // if the write structure has been set, use that + if d.write != nil && d.write.Structure != nil { + return d.write.Structure + } + + // fall back to inheriting from read structure + if d.read != nil && d.read.Structure != nil { + return d.read.Structure + } + + // use a default of json as a last resort + sch := dataset.BaseSchemaArray + if data.Type() == "dict" { + sch = dataset.BaseSchemaObject + } + + return &dataset.Structure{ + Format: "json", + Schema: sch, + } +} diff --git a/ds/dataset_test.go b/ds/dataset_test.go index 5f392ef..de1a89e 100644 --- a/ds/dataset_test.go +++ b/ds/dataset_test.go @@ -177,7 +177,9 @@ func TestFile(t *testing.T) { starlarktest.SetReporter(thread, t) // Execute test file - _, err := starlark.ExecFile(thread, "testdata/test.star", nil, nil) + _, err := starlark.ExecFile(thread, "testdata/test.star", nil, starlark.StringDict{ + "csv_ds": csvDataset().Methods(), + }) if err != nil { if ee, ok := err.(*starlark.EvalError); ok { t.Error(ee.Backtrace()) @@ -200,3 +202,37 @@ func newLoader() func(thread *starlark.Thread, module string) (starlark.StringDi return nil, fmt.Errorf("invalid module") } } + +func csvDataset() *Dataset { + text := `title,count,is great +foo,1,true +bar,2,false +bat,3,meh +` + ds := &dataset.Dataset{ + Structure: &dataset.Structure{ + Format: "csv", + FormatConfig: map[string]interface{}{ + "headerRow": true, + }, + Schema: map[string]interface{}{ + "type": "array", + "items": map[string]interface{}{ + "type": "array", + "items": []interface{}{ + map[string]interface{}{"title": "title", "type": "string"}, + map[string]interface{}{"title": "count", "type": "integer"}, + map[string]interface{}{"title": "is great", "type": "string"}, + }, + }, + }, + }, + } + ds.SetBodyFile(qfs.NewMemfileBytes("data.csv", []byte(text))) + + d := NewDataset(ds, nil) + d.SetMutable(&dataset.Dataset{ + Structure: ds.Structure, + }) + return d +} diff --git a/ds/testdata/test.star b/ds/testdata/test.star index 0bc042f..946321e 100644 --- a/ds/testdata/test.star +++ b/ds/testdata/test.star @@ -35,4 +35,14 @@ assert.eq(ds.set_body(bd), None) assert.eq(ds.set_body("[[1,2,3]]", raw=True), None) # TODO - haven't thought through this yet -assert.eq(ds.get_body(), bd) \ No newline at end of file +assert.eq(ds.get_body(), bd) + +# csv_ds is a global variable provided by dataset_test.go +# "cycling" csv data through starlark shouldn't have significant effects on the +# encoded data. whitespace is *not* significant. +# csv data is one of the harder formats, where there header row must be preserved +csv_ds.set_body(csv_ds.get_body()) + +expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]] +assert.eq(expect_data, csv_ds.get_body()) +assert.eq(csv_ds.get_structure()['format'], 'csv') \ No newline at end of file From df3e15f05517ab26cecd2d43ae286692c784fa6e Mon Sep 17 00:00:00 2001 From: b5 Date: Thu, 16 May 2019 11:51:14 -0400 Subject: [PATCH 2/4] refactor(ds.set_body): replace raw and data_format params with parse_as the mental model behind 'raw' and 'data_format' is off. 'data_format' implies that we're changing the ultimate data format, which is not the case. Now set_body has only one optional argument: 'parse_as', which defaults to the empty string. By default qri assumes the data value provided to set_body is an iterable starlark data structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will be a string of serialized structured data in the given format. valid parse_as values are "json", "csv", "cbor", "xlsx". We'll need more tests to confirm each format works properly, but all this is doing under the hood is overriding all parsing and setting body bytes directly, which is analogous to "qri save --body file.[parse_as] me/dataset". BREAKING CHANGE: ds.set_body raw and data_format params have been replaced by parse_as --- ds/dataset.go | 37 ++++++++++++++++--------------------- ds/testdata/test.star | 2 +- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/ds/dataset.go b/ds/dataset.go index 581828b..64b9976 100644 --- a/ds/dataset.go +++ b/ds/dataset.go @@ -278,12 +278,11 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta // even if assigned value is the same as what was already there. func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) { var ( - data starlark.Value - raw starlark.Bool - dataFormat starlark.String + data starlark.Value + parseAs starlark.String ) - if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "raw?", &raw, "data_format", &dataFormat); err != nil { + if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "parse_as?", &parseAs); err != nil { return starlark.None, err } @@ -300,30 +299,26 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, err } - df := dataFormat.GoString() - if df == "" { - // default to json - df = "json" - } - - if _, err := dataset.ParseDataFormatString(df); err != nil { - return starlark.None, fmt.Errorf("invalid data_format: '%s'", df) - } + df := parseAs.GoString() + if df != "" { + if _, err := dataset.ParseDataFormatString(df); err != nil { + return starlark.None, fmt.Errorf("invalid parse_as format: '%s'", df) + } - if raw { - if str, ok := data.(starlark.String); ok { - d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str)))) - d.modBody = true - d.bodyCache = nil - return starlark.None, nil + str, ok := data.(starlark.String) + if !ok { + return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df) } - return starlark.None, fmt.Errorf("expected raw data for body to be a string") + d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str)))) + d.modBody = true + d.bodyCache = nil + return starlark.None, nil } iter, ok := data.(starlark.Iterable) if !ok { - return starlark.None, fmt.Errorf("expected body to be iterable") + return starlark.None, fmt.Errorf("expected body data to be iterable") } d.write.Structure = d.writeStructure(data) diff --git a/ds/testdata/test.star b/ds/testdata/test.star index 946321e..a1e3d7a 100644 --- a/ds/testdata/test.star +++ b/ds/testdata/test.star @@ -32,7 +32,7 @@ bd_obj = {'a': [1,2,3]} assert.eq(ds.set_body(bd_obj), None) assert.eq(ds.set_body(bd), None) -assert.eq(ds.set_body("[[1,2,3]]", raw=True), None) +assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None) # TODO - haven't thought through this yet assert.eq(ds.get_body(), bd) From 008ac8ccfdda85ae113842fb772c176bf9037530 Mon Sep 17 00:00:00 2001 From: b5 Date: Thu, 16 May 2019 15:00:38 -0400 Subject: [PATCH 3/4] docs(ds): add initial outline docs for ds --- ds/doc.go | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 ds/doc.go diff --git a/ds/doc.go b/ds/doc.go new file mode 100644 index 0000000..135ec01 --- /dev/null +++ b/ds/doc.go @@ -0,0 +1,28 @@ +/*Package ds defines the qri dataset object within starlark + + outline: ds + ds defines the qri dataset object within starlark. it's loaded by default + in the qri runtime + + types: + Dataset + a qri dataset. Datasets can be either read-only or read-write. By default datasets are read-write + methods: + set_meta(meta dict) + set dataset meta component + get_meta() dict|None + get dataset meta component + get_structure() dict|None + get dataset structure component if one is defined + set_structure(structure) structure + set dataset structure component + get_body() dict|list|None + get dataset body component if one is defined + set_body(data dict|list, parse_as? string) body + set dataset body component. set_body has only one optional argument: 'parse_as', which defaults to the + empty string. By default qri assumes the data value provided to set_body is an iterable starlark data + structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will + be a string of serialized structured data in the given format. valid parse_as values are "json", "csv", + "cbor", "xlsx". +*/ +package ds From 349cafd5ed1b547858273f3f1414c6fb993a1ce9 Mon Sep 17 00:00:00 2001 From: b5 Date: Thu, 16 May 2019 16:35:14 -0400 Subject: [PATCH 4/4] chore(ds): update 'data' filenames to 'body' --- ds/dataset.go | 6 +++--- ds/dataset_test.go | 2 +- ds/testdata/test.star | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ds/dataset.go b/ds/dataset.go index 64b9976..fed1802 100644 --- a/ds/dataset.go +++ b/ds/dataset.go @@ -248,9 +248,9 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta if err != nil { return starlark.None, err } - provider.SetBodyFile(qfs.NewMemfileBytes("data.json", data)) + provider.SetBodyFile(qfs.NewMemfileBytes("body.json", data)) - rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("data.json", data)) + rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("body.json", data)) if err != nil { return starlark.None, fmt.Errorf("error allocating data reader: %s", err) } @@ -310,7 +310,7 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df) } - d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str)))) + d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str)))) d.modBody = true d.bodyCache = nil return starlark.None, nil diff --git a/ds/dataset_test.go b/ds/dataset_test.go index de1a89e..5be0ecf 100644 --- a/ds/dataset_test.go +++ b/ds/dataset_test.go @@ -228,7 +228,7 @@ bat,3,meh }, }, } - ds.SetBodyFile(qfs.NewMemfileBytes("data.csv", []byte(text))) + ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(text))) d := NewDataset(ds, nil) d.SetMutable(&dataset.Dataset{ diff --git a/ds/testdata/test.star b/ds/testdata/test.star index a1e3d7a..3ff66b0 100644 --- a/ds/testdata/test.star +++ b/ds/testdata/test.star @@ -38,11 +38,11 @@ assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None) assert.eq(ds.get_body(), bd) # csv_ds is a global variable provided by dataset_test.go -# "cycling" csv data through starlark shouldn't have significant effects on the +# round-tripping csv data through starlark shouldn't have significant effects on the # encoded data. whitespace is *not* significant. # csv data is one of the harder formats, where there header row must be preserved csv_ds.set_body(csv_ds.get_body()) expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]] assert.eq(expect_data, csv_ds.get_body()) -assert.eq(csv_ds.get_structure()['format'], 'csv') \ No newline at end of file +assert.eq(csv_ds.get_structure()['format'], 'csv')