better set_body #42

Merged · 4 commits · May 16, 2019
84 changes: 51 additions & 33 deletions ds/dataset.go
@@ -248,9 +248,9 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
     if err != nil {
         return starlark.None, err
     }
-    provider.SetBodyFile(qfs.NewMemfileBytes("data.json", data))
+    provider.SetBodyFile(qfs.NewMemfileBytes("body.json", data))

-    rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("data.json", data))
+    rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("body.json", data))
     if err != nil {
         return starlark.None, fmt.Errorf("error allocating data reader: %s", err)
     }
@@ -278,12 +278,11 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 // even if assigned value is the same as what was already there.
 func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
     var (
-        data starlark.Value
-        raw starlark.Bool
-        dataFormat starlark.String
+        data starlark.Value
+        parseAs starlark.String
     )

-    if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "raw?", &raw, "data_format", &dataFormat); err != nil {
+    if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "parse_as?", &parseAs); err != nil {
         return starlark.None, err
     }

@@ -295,58 +294,77 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
         return starlark.None, err
     }

-    df := dataFormat.GoString()
-    if df == "" {
-        // default to json
-        df = "json"
+    if err := d.checkField("structure"); err != nil {
+        err = fmt.Errorf("cannot use a transform to set the body of a dataset and manually adjust structure at the same time")
+        return starlark.None, err
     }

-    if _, err := dataset.ParseDataFormatString(df); err != nil {
-        return starlark.None, fmt.Errorf("invalid data_format: '%s'", df)
-    }
+    df := parseAs.GoString()
+    if df != "" {
+        if _, err := dataset.ParseDataFormatString(df); err != nil {
+            return starlark.None, fmt.Errorf("invalid parse_as format: '%s'", df)
+        }

-    if raw {
-        if str, ok := data.(starlark.String); ok {
-            d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str))))
-            d.modBody = true
-            d.bodyCache = nil
-            return starlark.None, nil
+        str, ok := data.(starlark.String)
+        if !ok {
+            return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df)
         }

-        return starlark.None, fmt.Errorf("expected raw data for body to be a string")
+        d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str))))
+        d.modBody = true
+        d.bodyCache = nil
+        return starlark.None, nil
     }

     iter, ok := data.(starlark.Iterable)
     if !ok {
-        return starlark.None, fmt.Errorf("expected body to be iterable")
-    }
-
-    sch := dataset.BaseSchemaArray
-    if data.Type() == "dict" {
-        sch = dataset.BaseSchemaObject
+        return starlark.None, fmt.Errorf("expected body data to be iterable")
     }

-    st := &dataset.Structure{
-        Format: df,
-        Schema: sch,
-    }
+    d.write.Structure = d.writeStructure(data)

-    w, err := dsio.NewEntryBuffer(st)
+    w, err := dsio.NewEntryBuffer(d.write.Structure)
     if err != nil {
         return starlark.None, err
     }

-    r := NewEntryReader(st, iter)
+    r := NewEntryReader(d.write.Structure, iter)
     if err := dsio.Copy(r, w); err != nil {
         return starlark.None, err
     }
     if err := w.Close(); err != nil {
         return starlark.None, err
     }

-    d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), w.Bytes()))
+    d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", d.write.Structure.Format), w.Bytes()))
     d.modBody = true
     d.bodyCache = nil

     return starlark.None, nil
 }
+
+// writeStructure determines the destination data structure for writing a
+// dataset body, falling back to a default json structure based on input values
+// if no prior structure exists
+func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure {
+    // if the write structure has been set, use that
+    if d.write != nil && d.write.Structure != nil {
+        return d.write.Structure
+    }
+
+    // fall back to inheriting from read structure
+    if d.read != nil && d.read.Structure != nil {
+        return d.read.Structure
+    }
+
+    // use a default of json as a last resort
+    sch := dataset.BaseSchemaArray
+    if data.Type() == "dict" {
+        sch = dataset.BaseSchemaObject
+    }
+
+    return &dataset.Structure{
+        Format: "json",
+        Schema: sch,
+    }
+}
38 changes: 37 additions & 1 deletion ds/dataset_test.go
@@ -177,7 +177,9 @@ func TestFile(t *testing.T) {
     starlarktest.SetReporter(thread, t)

     // Execute test file
-    _, err := starlark.ExecFile(thread, "testdata/test.star", nil, nil)
+    _, err := starlark.ExecFile(thread, "testdata/test.star", nil, starlark.StringDict{
+        "csv_ds": csvDataset().Methods(),
+    })
     if err != nil {
         if ee, ok := err.(*starlark.EvalError); ok {
             t.Error(ee.Backtrace())
@@ -200,3 +202,37 @@ func newLoader() func(thread *starlark.Thread, module string) (starlark.StringDi
         return nil, fmt.Errorf("invalid module")
     }
 }
+
+func csvDataset() *Dataset {
+    text := `title,count,is great
+foo,1,true
+bar,2,false
+bat,3,meh
+`
+    ds := &dataset.Dataset{
+        Structure: &dataset.Structure{
+            Format: "csv",
+            FormatConfig: map[string]interface{}{
+                "headerRow": true,
+            },
+            Schema: map[string]interface{}{
+                "type": "array",
+                "items": map[string]interface{}{
+                    "type": "array",
+                    "items": []interface{}{
+                        map[string]interface{}{"title": "title", "type": "string"},
+                        map[string]interface{}{"title": "count", "type": "integer"},
+                        map[string]interface{}{"title": "is great", "type": "string"},
+                    },
+                },
+            },
+        },
+    }
+    ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(text)))
+
+    d := NewDataset(ds, nil)
+    d.SetMutable(&dataset.Dataset{
+        Structure: ds.Structure,
+    })
+    return d
+}
28 changes: 28 additions & 0 deletions ds/doc.go
@@ -0,0 +1,28 @@
/*Package ds defines the qri dataset object within starlark

outline: ds
ds defines the qri dataset object within starlark. it's loaded by default
in the qri runtime

types:
Dataset
a qri dataset. Datasets can be either read-only or read-write. By default datasets are read-write
methods:
set_meta(meta dict)
set dataset meta component
get_meta() dict|None
get dataset meta component
get_structure() dict|None
get dataset structure component if one is defined
set_structure(structure) structure
set dataset structure component
get_body() dict|list|None
get dataset body component if one is defined
set_body(data dict|list, parse_as? string) body
set dataset body component. set_body has only one optional argument: 'parse_as', which defaults to the
empty string. By default qri assumes the data value provided to set_body is an iterable starlark data
structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will
be a string of serialized structured data in the given format. valid parse_as values are "json", "csv",
"cbor", "xlsx".
*/
package ds
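As a quick illustration of the contract documented above, here is a minimal Starlark usage sketch (not part of the diff): `ds` stands in for a dataset handle like the one used in ds/testdata/test.star below, and the literal values are placeholders.

# native starlark data: lists and dicts are written using a structure
# picked by writeStructure (existing write/read structure, else json)
ds.set_body([["foo", 1, True], ["bar", 2, False]])
ds.set_body({"a": [1, 2, 3]})

# pre-serialized data: pass a string plus parse_as ("json", "csv", "cbor", "xlsx")
ds.set_body('[["foo",1,true],["bar",2,false]]', parse_as="json")

# read the body back as native starlark values
body = ds.get_body()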
14 changes: 12 additions & 2 deletions ds/testdata/test.star
@@ -32,7 +32,17 @@ bd_obj = {'a': [1,2,3]}

 assert.eq(ds.set_body(bd_obj), None)
 assert.eq(ds.set_body(bd), None)
-assert.eq(ds.set_body("[[1,2,3]]", raw=True), None)
+assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None)

-# TODO - haven't thought through this yet
+assert.eq(ds.get_body(), bd)
 assert.eq(ds.get_body(), bd)
+
+# csv_ds is a global variable provided by dataset_test.go
+# round-tripping csv data through starlark shouldn't have significant effects on the
+# encoded data. whitespace is *not* significant.
+# csv data is one of the harder formats, where the header row must be preserved
+csv_ds.set_body(csv_ds.get_body())
+
+expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]]
+assert.eq(expect_data, csv_ds.get_body())
+assert.eq(csv_ds.get_structure()['format'], 'csv')