diff --git a/ds/dataset.go b/ds/dataset.go index 8fda315..fed1802 100644 --- a/ds/dataset.go +++ b/ds/dataset.go @@ -248,9 +248,9 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta if err != nil { return starlark.None, err } - provider.SetBodyFile(qfs.NewMemfileBytes("data.json", data)) + provider.SetBodyFile(qfs.NewMemfileBytes("body.json", data)) - rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("data.json", data)) + rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("body.json", data)) if err != nil { return starlark.None, fmt.Errorf("error allocating data reader: %s", err) } @@ -278,12 +278,11 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta // even if assigned value is the same as what was already there. func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) { var ( - data starlark.Value - raw starlark.Bool - dataFormat starlark.String + data starlark.Value + parseAs starlark.String ) - if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "raw?", &raw, "data_format", &dataFormat); err != nil { + if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "parse_as?", &parseAs); err != nil { return starlark.None, err } @@ -295,48 +294,41 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, err } - df := dataFormat.GoString() - if df == "" { - // default to json - df = "json" + if err := d.checkField("structure"); err != nil { + err = fmt.Errorf("cannot use a transform to set the body of a dataset and manually adjust structure at the same time") + return starlark.None, err } - if _, err := dataset.ParseDataFormatString(df); err != nil { - return starlark.None, fmt.Errorf("invalid data_format: '%s'", df) - } + df := parseAs.GoString() + if df != "" { + if _, err := 
dataset.ParseDataFormatString(df); err != nil { + return starlark.None, fmt.Errorf("invalid parse_as format: '%s'", df) + } - if raw { - if str, ok := data.(starlark.String); ok { - d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str)))) - d.modBody = true - d.bodyCache = nil - return starlark.None, nil + str, ok := data.(starlark.String) + if !ok { + return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df) } - return starlark.None, fmt.Errorf("expected raw data for body to be a string") + d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str)))) + d.modBody = true + d.bodyCache = nil + return starlark.None, nil } iter, ok := data.(starlark.Iterable) if !ok { - return starlark.None, fmt.Errorf("expected body to be iterable") - } - - sch := dataset.BaseSchemaArray - if data.Type() == "dict" { - sch = dataset.BaseSchemaObject + return starlark.None, fmt.Errorf("expected body data to be iterable") } - st := &dataset.Structure{ - Format: df, - Schema: sch, - } + d.write.Structure = d.writeStructure(data) - w, err := dsio.NewEntryBuffer(st) + w, err := dsio.NewEntryBuffer(d.write.Structure) if err != nil { return starlark.None, err } - r := NewEntryReader(st, iter) + r := NewEntryReader(d.write.Structure, iter) if err := dsio.Copy(r, w); err != nil { return starlark.None, err } @@ -344,9 +336,35 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta return starlark.None, err } - d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), w.Bytes())) + d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", d.write.Structure.Format), w.Bytes())) d.modBody = true d.bodyCache = nil return starlark.None, nil } + +// writeStructure determines the destination data structure for writing a +// dataset body, falling back to a default json structure based on input values +// if no prior structure exists +func (d *Dataset) writeStructure(data 
starlark.Value) *dataset.Structure { + // if the write structure has been set, use that + if d.write != nil && d.write.Structure != nil { + return d.write.Structure + } + + // fall back to inheriting from read structure + if d.read != nil && d.read.Structure != nil { + return d.read.Structure + } + + // use a default of json as a last resort + sch := dataset.BaseSchemaArray + if data.Type() == "dict" { + sch = dataset.BaseSchemaObject + } + + return &dataset.Structure{ + Format: "json", + Schema: sch, + } +} diff --git a/ds/dataset_test.go b/ds/dataset_test.go index 5f392ef..5be0ecf 100644 --- a/ds/dataset_test.go +++ b/ds/dataset_test.go @@ -177,7 +177,9 @@ func TestFile(t *testing.T) { starlarktest.SetReporter(thread, t) // Execute test file - _, err := starlark.ExecFile(thread, "testdata/test.star", nil, nil) + _, err := starlark.ExecFile(thread, "testdata/test.star", nil, starlark.StringDict{ + "csv_ds": csvDataset().Methods(), + }) if err != nil { if ee, ok := err.(*starlark.EvalError); ok { t.Error(ee.Backtrace()) @@ -200,3 +202,37 @@ func newLoader() func(thread *starlark.Thread, module string) (starlark.StringDi return nil, fmt.Errorf("invalid module") } } + +func csvDataset() *Dataset { + text := `title,count,is great +foo,1,true +bar,2,false +bat,3,meh +` + ds := &dataset.Dataset{ + Structure: &dataset.Structure{ + Format: "csv", + FormatConfig: map[string]interface{}{ + "headerRow": true, + }, + Schema: map[string]interface{}{ + "type": "array", + "items": map[string]interface{}{ + "type": "array", + "items": []interface{}{ + map[string]interface{}{"title": "title", "type": "string"}, + map[string]interface{}{"title": "count", "type": "integer"}, + map[string]interface{}{"title": "is great", "type": "string"}, + }, + }, + }, + }, + } + ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(text))) + + d := NewDataset(ds, nil) + d.SetMutable(&dataset.Dataset{ + Structure: ds.Structure, + }) + return d +} diff --git a/ds/doc.go b/ds/doc.go new file mode 
100644 index 0000000..135ec01 --- /dev/null +++ b/ds/doc.go @@ -0,0 +1,28 @@ +/*Package ds defines the qri dataset object within starlark + + outline: ds + ds defines the qri dataset object within starlark. it's loaded by default + in the qri runtime + + types: + Dataset + a qri dataset. Datasets can be either read-only or read-write. By default datasets are read-write + methods: + set_meta(meta dict) + set dataset meta component + get_meta() dict|None + get dataset meta component + get_structure() dict|None + get dataset structure component if one is defined + set_structure(structure) structure + set dataset structure component + get_body() dict|list|None + get dataset body component if one is defined + set_body(data dict|list, parse_as? string) body + set dataset body component. set_body has only one optional argument: 'parse_as', which defaults to the + empty string. By default qri assumes the data value provided to set_body is an iterable starlark data + structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will + be a string of serialized structured data in the given format. valid parse_as values are "json", "csv", + "cbor", "xlsx". +*/ +package ds diff --git a/ds/testdata/test.star b/ds/testdata/test.star index 0bc042f..3ff66b0 100644 --- a/ds/testdata/test.star +++ b/ds/testdata/test.star @@ -32,7 +32,17 @@ bd_obj = {'a': [1,2,3]} assert.eq(ds.set_body(bd_obj), None) assert.eq(ds.set_body(bd), None) -assert.eq(ds.set_body("[[1,2,3]]", raw=True), None) +assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None) # TODO - haven't thought through this yet -assert.eq(ds.get_body(), bd) \ No newline at end of file +assert.eq(ds.get_body(), bd) + +# csv_ds is a global variable provided by dataset_test.go +# round-tripping csv data through starlark shouldn't have significant effects on the +# encoded data. whitespace is *not* significant. 
+# csv data is one of the harder formats, where the header row must be preserved +csv_ds.set_body(csv_ds.get_body()) + +expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]] +assert.eq(expect_data, csv_ds.get_body()) +assert.eq(csv_ds.get_structure()['format'], 'csv')