Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly update and trim codepoint indices after trimming data #62

Merged
merged 6 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions buff.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package fixedwidth
import (
"bytes"
"errors"
"strings"
"unicode/utf8"
)

Expand Down Expand Up @@ -183,6 +184,54 @@ type rawValue struct {
codepointIndices []int
}

// trimLeft returns a new rawValue whose data has every leading character
// found in cutset removed.
//
// If r has no codepoint indices (nil), the result has none either. Otherwise
// the number of bytes removed from the front is passed to
// trimCodepointIndices so the returned indices line up with the shorter data.
func (r rawValue) trimLeft(cutset string) rawValue {
	trimmed := strings.TrimLeft(r.data, cutset)
	if r.codepointIndices == nil {
		return rawValue{data: trimmed}
	}

	// Only the front was touched, so nothing is reported as removed on the right.
	dropped := len(r.data) - len(trimmed)
	return rawValue{
		data:             trimmed,
		codepointIndices: r.trimCodepointIndices(dropped, 0),
	}
}

// trimRight returns a new rawValue whose data has every trailing character
// found in cutset removed.
//
// If r has no codepoint indices (nil), the result has none either. Otherwise
// the number of bytes removed from the end is passed to trimCodepointIndices
// so indices that pointed into the removed tail are not carried over.
func (r rawValue) trimRight(cutset string) rawValue {
	trimmed := strings.TrimRight(r.data, cutset)
	if r.codepointIndices == nil {
		return rawValue{data: trimmed}
	}

	// Only the tail was touched, so nothing is reported as removed on the left.
	dropped := len(r.data) - len(trimmed)
	return rawValue{
		data:             trimmed,
		codepointIndices: r.trimCodepointIndices(0, dropped),
	}
}

// trim returns a new rawValue whose data has every leading and trailing
// character found in cutset removed.
//
// The two sides are trimmed one after the other so the byte count removed
// from each side is known separately. If r has no codepoint indices (nil),
// the result has none either; otherwise both counts are passed to
// trimCodepointIndices to produce the indices for the trimmed data.
func (r rawValue) trim(cutset string) rawValue {
	afterLeft := strings.TrimLeft(r.data, cutset)
	afterBoth := strings.TrimRight(afterLeft, cutset)
	if r.codepointIndices == nil {
		return rawValue{data: afterBoth}
	}

	fromFront := len(r.data) - len(afterLeft)
	fromBack := len(afterLeft) - len(afterBoth)
	return rawValue{
		data:             afterBoth,
		codepointIndices: r.trimCodepointIndices(fromFront, fromBack),
	}
}

// trimCodepointIndices builds the index slice that goes with r.data after
// leftRemovedBytes bytes have been cut from its front and rightRemovedBytes
// bytes from its end.
//
// Indices that fall inside either removed region are dropped. Each surviving
// index is reduced by leftRemovedBytes so it is relative to the start of the
// trimmed data. The result is always a freshly allocated, non-nil slice, and
// r itself is not modified.
func (r rawValue) trimCodepointIndices(leftRemovedBytes int, rightRemovedBytes int) []int {
	// First byte offset (in the untrimmed data) that belongs to the removed tail.
	upper := len(r.data) - rightRemovedBytes

	kept := make([]int, 0, len(r.codepointIndices))
	for _, offset := range r.codepointIndices {
		if offset < leftRemovedBytes || offset >= upper {
			continue
		}
		kept = append(kept, offset-leftRemovedBytes)
	}
	return kept
}

func newRawValue(data string, useCodepointIndices bool) (rawValue, error) {
value := rawValue{
data: data,
Expand Down
38 changes: 22 additions & 16 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"io"
"reflect"
"strconv"
"strings"
)

var (
Expand Down Expand Up @@ -197,20 +196,20 @@ func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
}

func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawValue {
var trimFunc func(string) string
var trimFunc func(r rawValue) rawValue

switch format.alignment {
case left:
trimFunc = func(s string) string {
return strings.TrimRight(s, string(format.padChar))
case left: // Aligned left, so trim from right side.
trimFunc = func(r rawValue) rawValue {
return r.trimRight(string(format.padChar))
}
case right:
trimFunc = func(s string) string {
return strings.TrimLeft(s, string(format.padChar))
case right: // Aligned right, so trim from left side.
trimFunc = func(r rawValue) rawValue {
return r.trimLeft(string(format.padChar))
}
default:
trimFunc = func(s string) string {
return strings.Trim(s, string(format.padChar))
trimFunc = func(r rawValue) rawValue {
return r.trim(string(format.padChar))
}
}

Expand All @@ -227,20 +226,27 @@ func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawVa
relevantIndices = value.codepointIndices[startPos-1 : endPos]
lineData = value.data[relevantIndices[0]:value.codepointIndices[endPos]]
}
return rawValue{
data: trimFunc(lineData),
codepointIndices: relevantIndices,

newIndices := relevantIndices
if relevantIndices[0] > 0 {
// We trimmed data from the front of the string.
// We need to adjust the codepoint indices to reflect this, as they have shifted.
removedFromFront := relevantIndices[0]
newIndices = make([]int, 0, len(relevantIndices))
for _, idx := range relevantIndices {
newIndices = append(newIndices, idx-removedFromFront)
}
}

return trimFunc(rawValue{data: lineData, codepointIndices: newIndices})
} else {
if len(value.data) == 0 || startPos > len(value.data) {
return rawValue{data: ""}
}
if endPos > len(value.data) {
endPos = len(value.data)
}
return rawValue{
data: trimFunc(value.data[startPos-1 : endPos]),
}
return trimFunc(rawValue{data: value.data[startPos-1 : endPos]})
}
}

Expand Down
116 changes: 116 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,122 @@ func TestDecodeSetUseCodepointIndices(t *testing.T) {

}

// TestDecodeSetUseCodepointIndices_Nested decodes a single 21-column line
// into a struct that alternates plain string fields with nested structs,
// with codepoint indices enabled on the decoder. It runs once with
// ASCII-only input and once with input that contains multi-byte characters.
func TestDecodeSetUseCodepointIndices_Nested(t *testing.T) {
	type Nested struct {
		First  string `fixed:"1,3"`
		Second string `fixed:"4,6"`
	}

	type Test struct {
		First  string `fixed:"1,3"`
		Second Nested `fixed:"4,9"`
		Third  string `fixed:"10,12"`
		Fourth Nested `fixed:"13,18"`
		Fifth  string `fixed:"19,21"`
	}

	cases := []struct {
		name     string
		raw      []byte
		expected Test
	}{
		{
			name: "All ASCII characters",
			raw:  []byte("123ABC456DEF789GHI012\n"),
			expected: Test{
				First:  "123",
				Second: Nested{First: "ABC", Second: "456"},
				Third:  "DEF",
				Fourth: Nested{First: "789", Second: "GHI"},
				Fifth:  "012",
			},
		},
		{
			name: "Multi-byte characters",
			raw:  []byte("123x☃x456x☃x789x☃x012\n"),
			expected: Test{
				First:  "123",
				Second: Nested{First: "x☃x", Second: "456"},
				Third:  "x☃x",
				Fourth: Nested{First: "789", Second: "x☃x"},
				Fifth:  "012",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			dec := NewDecoder(bytes.NewReader(tc.raw))
			dec.SetUseCodepointIndices(true)

			var got Test
			if err := dec.Decode(&got); err != nil {
				t.Errorf("Unexpected err: %v", err)
			}
			if !reflect.DeepEqual(tc.expected, got) {
				t.Errorf("Decode(%v) want %v, have %v", tc.raw, tc.expected, got)
			}
		})
	}
}

// TestDecodeSetUseCodepointIndices_PaddingTrimmed decodes a 10-column line
// with codepoint indices enabled on the decoder. Columns 1-8 map to a nested
// struct and columns 9-10 to a plain string.
//
// In both fixtures columns 3-8 are space padding, so the nested string
// fields are expected to decode as empty, and the zero-padded, right-aligned
// integer in columns 1-2 as 0. The trailing field in columns 9-10 must still
// decode correctly, for ASCII and for multi-byte input.
func TestDecodeSetUseCodepointIndices_PaddingTrimmed(t *testing.T) {
	type Nested struct {
		First  int64  `fixed:"1,2,right,0"`
		Second string `fixed:"3,4"`
		Third  string `fixed:"5,6"`
		Fourth string `fixed:"7,8"`
	}
	type Test struct {
		First  Nested `fixed:"1,8"`
		Second string `fixed:"9,10"`
	}

	for _, tt := range []struct {
		name     string
		raw      []byte
		expected Test
	}{
		{
			name: "All ASCII characters",
			// Columns 1-2 "00", columns 3-8 six spaces, columns 9-10 "11".
			raw: []byte("00      11"),
			expected: Test{
				First: Nested{
					First:  0,
					Second: "",
					Third:  "",
					Fourth: "",
				},
				Second: "11",
			},
		},
		{
			name: "Multi-byte characters",
			// Same layout; columns 9-10 hold two multi-byte characters.
			raw: []byte("00      ☃☃"),
			expected: Test{
				First: Nested{
					First:  0,
					Second: "",
					Third:  "",
					Fourth: "",
				},
				Second: "☃☃",
			},
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			d := NewDecoder(bytes.NewReader(tt.raw))
			d.SetUseCodepointIndices(true)
			var s Test
			// A failed decode leaves s meaningless, so stop this subtest here
			// instead of also reporting a misleading struct mismatch.
			if err := d.Decode(&s); err != nil {
				t.Fatalf("Unexpected err: %v", err)
			}
			if !reflect.DeepEqual(tt.expected, s) {
				t.Errorf("Decode(%q) want %+v, have %+v", tt.raw, tt.expected, s)
			}
		})
	}
}

// Verify the behavior of Decoder.Decode at the end of a file. See
// https://github.com/ianlopshire/go-fixedwidth/issues/6 for more details.
func TestDecode_EOF(t *testing.T) {
Expand Down
Loading