better set_body #42

Merged · 4 commits · May 16, 2019
84 changes: 51 additions & 33 deletions ds/dataset.go
@@ -248,9 +248,9 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
     if err != nil {
         return starlark.None, err
     }
-    provider.SetBodyFile(qfs.NewMemfileBytes("data.json", data))
+    provider.SetBodyFile(qfs.NewMemfileBytes("body.json", data))

-    rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("data.json", data))
+    rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("body.json", data))
     if err != nil {
         return starlark.None, fmt.Errorf("error allocating data reader: %s", err)
     }
@@ -278,12 +278,11 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 // even if assigned value is the same as what was already there.
 func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
     var (
-        data starlark.Value
-        raw starlark.Bool
-        dataFormat starlark.String
+        data starlark.Value
+        parseAs starlark.String
     )

-    if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "raw?", &raw, "data_format", &dataFormat); err != nil {
+    if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "parse_as?", &parseAs); err != nil {
         return starlark.None, err
     }

@@ -295,58 +294,77 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
         return starlark.None, err
     }

-    df := dataFormat.GoString()
-    if df == "" {
-        // default to json
-        df = "json"
+    if err := d.checkField("structure"); err != nil {
+        err = fmt.Errorf("cannot use a transform to set the body of a dataset and manually adjust structure at the same time")
+        return starlark.None, err
     }

-    if _, err := dataset.ParseDataFormatString(df); err != nil {
-        return starlark.None, fmt.Errorf("invalid data_format: '%s'", df)
-    }
+    df := parseAs.GoString()
+    if df != "" {
+        if _, err := dataset.ParseDataFormatString(df); err != nil {
+            return starlark.None, fmt.Errorf("invalid parse_as format: '%s'", df)
+        }

-    if raw {
-        if str, ok := data.(starlark.String); ok {
-            d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str))))
-            d.modBody = true
-            d.bodyCache = nil
-            return starlark.None, nil
+        str, ok := data.(starlark.String)
+        if !ok {
+            return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df)
         }

-        return starlark.None, fmt.Errorf("expected raw data for body to be a string")
+        d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str))))
+        d.modBody = true
+        d.bodyCache = nil
+        return starlark.None, nil
     }

     iter, ok := data.(starlark.Iterable)
     if !ok {
-        return starlark.None, fmt.Errorf("expected body to be iterable")
-    }
-
-    sch := dataset.BaseSchemaArray
-    if data.Type() == "dict" {
-        sch = dataset.BaseSchemaObject
+        return starlark.None, fmt.Errorf("expected body data to be iterable")
     }

-    st := &dataset.Structure{
-        Format: df,
-        Schema: sch,
-    }
+    d.write.Structure = d.writeStructure(data)

-    w, err := dsio.NewEntryBuffer(st)
+    w, err := dsio.NewEntryBuffer(d.write.Structure)
     if err != nil {
         return starlark.None, err
     }

-    r := NewEntryReader(st, iter)
+    r := NewEntryReader(d.write.Structure, iter)
     if err := dsio.Copy(r, w); err != nil {
         return starlark.None, err
     }
     if err := w.Close(); err != nil {
         return starlark.None, err
     }

-    d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), w.Bytes()))
+    d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", d.write.Structure.Format), w.Bytes()))
     d.modBody = true
     d.bodyCache = nil

     return starlark.None, nil
 }
+
+// writeStructure determines the destination data structure for writing a
+// dataset body, falling back to a default json structure based on input values
+// if no prior structure exists
+func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure {
+    // if the write structure has been set, use that
+    if d.write != nil && d.write.Structure != nil {
+        return d.write.Structure
+    }
+
+    // fall back to inheriting from read structure
+    if d.read != nil && d.read.Structure != nil {
+        return d.read.Structure
+    }
+
+    // use a default of json as a last resort
+    sch := dataset.BaseSchemaArray
+    if data.Type() == "dict" {
+        sch = dataset.BaseSchemaObject
+    }
+
+    return &dataset.Structure{
+        Format: "json",
+        Schema: sch,
+    }
+}
38 changes: 37 additions & 1 deletion ds/dataset_test.go
@@ -177,7 +177,9 @@ func TestFile(t *testing.T) {
     starlarktest.SetReporter(thread, t)

     // Execute test file
-    _, err := starlark.ExecFile(thread, "testdata/test.star", nil, nil)
+    _, err := starlark.ExecFile(thread, "testdata/test.star", nil, starlark.StringDict{
+        "csv_ds": csvDataset().Methods(),
+    })
     if err != nil {
         if ee, ok := err.(*starlark.EvalError); ok {
             t.Error(ee.Backtrace())
@@ -200,3 +202,37 @@ func newLoader() func(thread *starlark.Thread, module string) (starlark.StringDi
         return nil, fmt.Errorf("invalid module")
     }
 }
+
+func csvDataset() *Dataset {
+    text := `title,count,is great
+foo,1,true
+bar,2,false
+bat,3,meh
+`
+    ds := &dataset.Dataset{
+        Structure: &dataset.Structure{
+            Format: "csv",
+            FormatConfig: map[string]interface{}{
+                "headerRow": true,
+            },
+            Schema: map[string]interface{}{
+                "type": "array",
+                "items": map[string]interface{}{
+                    "type": "array",
+                    "items": []interface{}{
+                        map[string]interface{}{"title": "title", "type": "string"},
+                        map[string]interface{}{"title": "count", "type": "integer"},
+                        map[string]interface{}{"title": "is great", "type": "string"},
+                    },
+                },
+            },
+        },
+    }
+    ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(text)))
+
+    d := NewDataset(ds, nil)
+    d.SetMutable(&dataset.Dataset{
+        Structure: ds.Structure,
+    })
+    return d
+}
28 changes: 28 additions & 0 deletions ds/doc.go
@@ -0,0 +1,28 @@
/*Package ds defines the qri dataset object within starlark

outline: ds
ds defines the qri dataset object within starlark. it's loaded by default
in the qri runtime

types:
Dataset
a qri dataset. Datasets can be either read-only or read-write. By default datasets are read-write
methods:
set_meta(meta dict)
set dataset meta component
get_meta() dict|None
get dataset meta component
get_structure() dict|None
get dataset structure component if one is defined
set_structure(structure) structure
set dataset structure component
get_body() dict|list|None
get dataset body component if one is defined
set_body(data dict|list, parse_as? string) body
set dataset body component. set_body has only one optional argument: 'parse_as', which defaults to the
empty string. By default qri assumes the data value provided to set_body is an iterable starlark data
structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will
be a string of serialized structured data in the given format. valid parse_as values are "json", "csv",
"cbor", "xlsx".
*/
package ds
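As a quick illustration of the contract documented above, here is a minimal Starlark usage sketch (not part of the diff): `ds` stands in for a dataset handle like the one used in ds/testdata/test.star below, and the literal values are placeholders.

# native starlark data: lists and dicts are written using a structure
# picked by writeStructure (existing write/read structure, else json)
ds.set_body([["foo", 1, True], ["bar", 2, False]])
ds.set_body({"a": [1, 2, 3]})

# pre-serialized data: pass a string plus parse_as ("json", "csv", "cbor", "xlsx")
ds.set_body('[["foo",1,true],["bar",2,false]]', parse_as="json")

# read the body back as native starlark values
body = ds.get_body()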
14 changes: 12 additions & 2 deletions ds/testdata/test.star
@@ -32,7 +32,17 @@ bd_obj = {'a': [1,2,3]}

 assert.eq(ds.set_body(bd_obj), None)
 assert.eq(ds.set_body(bd), None)
-assert.eq(ds.set_body("[[1,2,3]]", raw=True), None)
+assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None)

-# TODO - haven't thought through this yet
+assert.eq(ds.get_body(), bd)
 assert.eq(ds.get_body(), bd)
+
+# csv_ds is a global variable provided by dataset_test.go
+# round-tripping csv data through starlark shouldn't have significant effects on the
+# encoded data. whitespace is *not* significant.
+# csv data is one of the harder formats, where the header row must be preserved
+csv_ds.set_body(csv_ds.get_body())
+
+expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]]
+assert.eq(expect_data, csv_ds.get_body())
+assert.eq(csv_ds.get_structure()['format'], 'csv')