From 97fd9dad727901c6d370f7cbfd2b01166296c2df Mon Sep 17 00:00:00 2001
From: b5 <sparkle_pony_2000@qri.io>
Date: Thu, 16 May 2019 11:21:33 -0400
Subject: [PATCH 1/4] fix(ds): ds.set_body inherits structure instead of
 inventing one

set_body wasn't doing the right thing when it comes to determining the
destination datset structure for writing a body. We now inherit structure
from existing dataset data, and only fall back to json if no structure
exists.

The general problem here is a separation of concerns between data in the
starlark runtime and dataset serialization formats. Those should always
be seperate.

closes qri-io/qri#756
---
 ds/dataset.go         | 47 ++++++++++++++++++++++++++++++++-----------
 ds/dataset_test.go    | 38 +++++++++++++++++++++++++++++++++-
 ds/testdata/test.star | 12 ++++++++++-
 3 files changed, 83 insertions(+), 14 deletions(-)

diff --git a/ds/dataset.go b/ds/dataset.go
index 8fda315..581828b 100644
--- a/ds/dataset.go
+++ b/ds/dataset.go
@@ -295,6 +295,11 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 		return starlark.None, err
 	}
 
+	if err := d.checkField("structure"); err != nil {
+		err = fmt.Errorf("cannot use a transform to set the body of a dataset and manually adjust structure at the same time")
+		return starlark.None, err
+	}
+
 	df := dataFormat.GoString()
 	if df == "" {
 		// default to json
@@ -321,22 +326,14 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 		return starlark.None, fmt.Errorf("expected body to be iterable")
 	}
 
-	sch := dataset.BaseSchemaArray
-	if data.Type() == "dict" {
-		sch = dataset.BaseSchemaObject
-	}
-
-	st := &dataset.Structure{
-		Format: df,
-		Schema: sch,
-	}
+	d.write.Structure = d.writeStructure(data)
 
-	w, err := dsio.NewEntryBuffer(st)
+	w, err := dsio.NewEntryBuffer(d.write.Structure)
 	if err != nil {
 		return starlark.None, err
 	}
 
-	r := NewEntryReader(st, iter)
+	r := NewEntryReader(d.write.Structure, iter)
 	if err := dsio.Copy(r, w); err != nil {
 		return starlark.None, err
 	}
@@ -344,9 +341,35 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 		return starlark.None, err
 	}
 
-	d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), w.Bytes()))
+	d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", d.write.Structure.Format), w.Bytes()))
 	d.modBody = true
 	d.bodyCache = nil
 
 	return starlark.None, nil
 }
+
+// writeStructure determines the destination data structure for writing a
+// dataset body, falling back to a default json structure based on input values
+// if no prior structure exists
+func (d *Dataset) writeStructure(data starlark.Value) *dataset.Structure {
+	// if the write structure has been set, use that
+	if d.write != nil && d.write.Structure != nil {
+		return d.write.Structure
+	}
+
+	// fall back to inheriting from read structure
+	if d.read != nil && d.read.Structure != nil {
+		return d.read.Structure
+	}
+
+	// use a default of json as a last resort
+	sch := dataset.BaseSchemaArray
+	if data.Type() == "dict" {
+		sch = dataset.BaseSchemaObject
+	}
+
+	return &dataset.Structure{
+		Format: "json",
+		Schema: sch,
+	}
+}
diff --git a/ds/dataset_test.go b/ds/dataset_test.go
index 5f392ef..de1a89e 100644
--- a/ds/dataset_test.go
+++ b/ds/dataset_test.go
@@ -177,7 +177,9 @@ func TestFile(t *testing.T) {
 	starlarktest.SetReporter(thread, t)
 
 	// Execute test file
-	_, err := starlark.ExecFile(thread, "testdata/test.star", nil, nil)
+	_, err := starlark.ExecFile(thread, "testdata/test.star", nil, starlark.StringDict{
+		"csv_ds": csvDataset().Methods(),
+	})
 	if err != nil {
 		if ee, ok := err.(*starlark.EvalError); ok {
 			t.Error(ee.Backtrace())
@@ -200,3 +202,37 @@ func newLoader() func(thread *starlark.Thread, module string) (starlark.StringDi
 		return nil, fmt.Errorf("invalid module")
 	}
 }
+
+func csvDataset() *Dataset {
+	text := `title,count,is great
+foo,1,true
+bar,2,false
+bat,3,meh
+`
+	ds := &dataset.Dataset{
+		Structure: &dataset.Structure{
+			Format: "csv",
+			FormatConfig: map[string]interface{}{
+				"headerRow": true,
+			},
+			Schema: map[string]interface{}{
+				"type": "array",
+				"items": map[string]interface{}{
+					"type": "array",
+					"items": []interface{}{
+						map[string]interface{}{"title": "title", "type": "string"},
+						map[string]interface{}{"title": "count", "type": "integer"},
+						map[string]interface{}{"title": "is great", "type": "string"},
+					},
+				},
+			},
+		},
+	}
+	ds.SetBodyFile(qfs.NewMemfileBytes("data.csv", []byte(text)))
+
+	d := NewDataset(ds, nil)
+	d.SetMutable(&dataset.Dataset{
+		Structure: ds.Structure,
+	})
+	return d
+}
diff --git a/ds/testdata/test.star b/ds/testdata/test.star
index 0bc042f..946321e 100644
--- a/ds/testdata/test.star
+++ b/ds/testdata/test.star
@@ -35,4 +35,14 @@ assert.eq(ds.set_body(bd), None)
 assert.eq(ds.set_body("[[1,2,3]]", raw=True), None)
 
 # TODO - haven't thought through this yet
-assert.eq(ds.get_body(), bd)
\ No newline at end of file
+assert.eq(ds.get_body(), bd)
+
+# csv_ds is a global variable provided by dataset_test.go
+# "cycling" csv data through starlark shouldn't have significant effects on the 
+# encoded data. whitespace is *not* significant.
+# csv data is one of the harder formats, where there header row must be preserved
+csv_ds.set_body(csv_ds.get_body())
+
+expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]]
+assert.eq(expect_data, csv_ds.get_body())
+assert.eq(csv_ds.get_structure()['format'], 'csv')
\ No newline at end of file

From df3e15f05517ab26cecd2d43ae286692c784fa6e Mon Sep 17 00:00:00 2001
From: b5 <sparkle_pony_2000@qri.io>
Date: Thu, 16 May 2019 11:51:14 -0400
Subject: [PATCH 2/4] refactor(ds.set_body): replace raw and data_format params
 with parse_as

the mental model behind 'raw' and 'data_format' is off. 'data_format' implies
that we're changing the ultimate data format, which is not the case.

Now set_body has only one optional argument: 'parse_as', which defaults to the
empty string. By default qri assumes the data value provided to set_body is
an iterable starlark data structure (tuple, set, list, dict). When parse_as
is set, set_body assumes the provided body value will be a string of serialized
structured data in the given format. valid parse_as values are "json", "csv",
"cbor", "xlsx".

We'll need more tests to confirm each format works properly, but all this is
doing under the hood is overriding all parsing and setting body bytes directly,
which is analogous to "qri save --body file.[parse_as] me/dataset".

BREAKING CHANGE:
ds.set_body raw and data_format params have been replaced by parse_as
---
 ds/dataset.go         | 37 ++++++++++++++++---------------------
 ds/testdata/test.star |  2 +-
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/ds/dataset.go b/ds/dataset.go
index 581828b..64b9976 100644
--- a/ds/dataset.go
+++ b/ds/dataset.go
@@ -278,12 +278,11 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 // even if assigned value is the same as what was already there.
 func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
 	var (
-		data       starlark.Value
-		raw        starlark.Bool
-		dataFormat starlark.String
+		data    starlark.Value
+		parseAs starlark.String
 	)
 
-	if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "raw?", &raw, "data_format", &dataFormat); err != nil {
+	if err := starlark.UnpackArgs("set_body", args, kwargs, "data", &data, "parse_as?", &parseAs); err != nil {
 		return starlark.None, err
 	}
 
@@ -300,30 +299,26 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 		return starlark.None, err
 	}
 
-	df := dataFormat.GoString()
-	if df == "" {
-		// default to json
-		df = "json"
-	}
-
-	if _, err := dataset.ParseDataFormatString(df); err != nil {
-		return starlark.None, fmt.Errorf("invalid data_format: '%s'", df)
-	}
+	df := parseAs.GoString()
+	if df != "" {
+		if _, err := dataset.ParseDataFormatString(df); err != nil {
+			return starlark.None, fmt.Errorf("invalid parse_as format: '%s'", df)
+		}
 
-	if raw {
-		if str, ok := data.(starlark.String); ok {
-			d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str))))
-			d.modBody = true
-			d.bodyCache = nil
-			return starlark.None, nil
+		str, ok := data.(starlark.String)
+		if !ok {
+			return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df)
 		}
 
-		return starlark.None, fmt.Errorf("expected raw data for body to be a string")
+		d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str))))
+		d.modBody = true
+		d.bodyCache = nil
+		return starlark.None, nil
 	}
 
 	iter, ok := data.(starlark.Iterable)
 	if !ok {
-		return starlark.None, fmt.Errorf("expected body to be iterable")
+		return starlark.None, fmt.Errorf("expected body data to be iterable")
 	}
 
 	d.write.Structure = d.writeStructure(data)
diff --git a/ds/testdata/test.star b/ds/testdata/test.star
index 946321e..a1e3d7a 100644
--- a/ds/testdata/test.star
+++ b/ds/testdata/test.star
@@ -32,7 +32,7 @@ bd_obj = {'a': [1,2,3]}
 
 assert.eq(ds.set_body(bd_obj), None)
 assert.eq(ds.set_body(bd), None)
-assert.eq(ds.set_body("[[1,2,3]]", raw=True), None)
+assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None)
 
 # TODO - haven't thought through this yet
 assert.eq(ds.get_body(), bd)

From 008ac8ccfdda85ae113842fb772c176bf9037530 Mon Sep 17 00:00:00 2001
From: b5 <sparkle_pony_2000@qri.io>
Date: Thu, 16 May 2019 15:00:38 -0400
Subject: [PATCH 3/4] docs(ds): add initial outline docs for ds

---
 ds/doc.go | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 ds/doc.go

diff --git a/ds/doc.go b/ds/doc.go
new file mode 100644
index 0000000..135ec01
--- /dev/null
+++ b/ds/doc.go
@@ -0,0 +1,28 @@
+/*Package ds defines the qri dataset object within starlark
+
+  outline: ds
+    ds defines the qri dataset object within starlark. it's loaded by default
+    in the qri runtime
+
+    types:
+      Dataset
+        a qri dataset. Datasets can be either read-only or read-write. By default datasets are read-write
+        methods:
+          set_meta(meta dict)
+            set dataset meta component
+          get_meta() dict|None
+            get dataset meta component
+          get_structure() dict|None
+            get dataset structure component if one is defined
+          set_structure(structure) structure
+            set dataset structure component
+          get_body() dict|list|None
+            get dataset body component if one is defined
+          set_body(data dict|list, parse_as? string) body
+            set dataset body component. set_body has only one optional argument: 'parse_as', which defaults to the
+            empty string. By default qri assumes the data value provided to set_body is an iterable starlark data
+            structure (tuple, set, list, dict). When parse_as is set, set_body assumes the provided body value will
+            be a string of serialized structured data in the given format. valid parse_as values are "json", "csv",
+            "cbor", "xlsx".
+*/
+package ds

From 349cafd5ed1b547858273f3f1414c6fb993a1ce9 Mon Sep 17 00:00:00 2001
From: b5 <sparkle_pony_2000@qri.io>
Date: Thu, 16 May 2019 16:35:14 -0400
Subject: [PATCH 4/4] chore(ds): update 'data' filenames to 'body'

---
 ds/dataset.go         | 6 +++---
 ds/dataset_test.go    | 2 +-
 ds/testdata/test.star | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ds/dataset.go b/ds/dataset.go
index 64b9976..fed1802 100644
--- a/ds/dataset.go
+++ b/ds/dataset.go
@@ -248,9 +248,9 @@ func (d *Dataset) GetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 	if err != nil {
 		return starlark.None, err
 	}
-	provider.SetBodyFile(qfs.NewMemfileBytes("data.json", data))
+	provider.SetBodyFile(qfs.NewMemfileBytes("body.json", data))
 
-	rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("data.json", data))
+	rr, err := dsio.NewEntryReader(provider.Structure, qfs.NewMemfileBytes("body.json", data))
 	if err != nil {
 		return starlark.None, fmt.Errorf("error allocating data reader: %s", err)
 	}
@@ -310,7 +310,7 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
 			return starlark.None, fmt.Errorf("expected data for '%s' format to be a string", df)
 		}
 
-		d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("data.%s", df), []byte(string(str))))
+		d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str))))
 		d.modBody = true
 		d.bodyCache = nil
 		return starlark.None, nil
diff --git a/ds/dataset_test.go b/ds/dataset_test.go
index de1a89e..5be0ecf 100644
--- a/ds/dataset_test.go
+++ b/ds/dataset_test.go
@@ -228,7 +228,7 @@ bat,3,meh
 			},
 		},
 	}
-	ds.SetBodyFile(qfs.NewMemfileBytes("data.csv", []byte(text)))
+	ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(text)))
 
 	d := NewDataset(ds, nil)
 	d.SetMutable(&dataset.Dataset{
diff --git a/ds/testdata/test.star b/ds/testdata/test.star
index a1e3d7a..3ff66b0 100644
--- a/ds/testdata/test.star
+++ b/ds/testdata/test.star
@@ -38,11 +38,11 @@ assert.eq(ds.set_body("[[1,2,3]]", parse_as="json"), None)
 assert.eq(ds.get_body(), bd)
 
 # csv_ds is a global variable provided by dataset_test.go
-# "cycling" csv data through starlark shouldn't have significant effects on the 
+# round-tripping csv data through starlark shouldn't have significant effects on the 
 # encoded data. whitespace is *not* significant.
 # csv data is one of the harder formats, where there header row must be preserved
 csv_ds.set_body(csv_ds.get_body())
 
 expect_data = [["foo",1,"true"], ["bar",2,"false"], ["bat",3,"meh"]]
 assert.eq(expect_data, csv_ds.get_body())
-assert.eq(csv_ds.get_structure()['format'], 'csv')
\ No newline at end of file
+assert.eq(csv_ds.get_structure()['format'], 'csv')