Skip to content

Commit

Permalink
Correctly update codepoint indices during decoding (#62)
Browse files Browse the repository at this point in the history
- Fix decoding issue where codepoint indices were incorrect after trimming data
- Fix decoding issue and potential panic where codepoint indices were not recalculated while reading a partial line
  • Loading branch information
sidkurella authored Feb 8, 2024
1 parent 48332ed commit 66e7247
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 16 deletions.
49 changes: 49 additions & 0 deletions buff.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package fixedwidth
import (
"bytes"
"errors"
"strings"
"unicode/utf8"
)

Expand Down Expand Up @@ -183,6 +184,54 @@ type rawValue struct {
codepointIndices []int
}

// trimLeft returns a copy of r whose data has every leading character found
// in cutset removed. If r carries codepoint indices they are recomputed for
// the shortened data; if it carries none (nil), the result carries none either.
func (r rawValue) trimLeft(cutset string) rawValue {
	trimmed := rawValue{data: strings.TrimLeft(r.data, cutset)}
	if r.codepointIndices != nil {
		// Everything that disappeared was removed from the front, so the
		// length difference is the byte offset to shift the indices by.
		dropped := len(r.data) - len(trimmed.data)
		trimmed.codepointIndices = r.trimCodepointIndices(dropped, 0)
	}
	return trimmed
}

// trimRight returns a copy of r whose data has every trailing character found
// in cutset removed. If r carries codepoint indices, those pointing into the
// removed tail are dropped; if it carries none (nil), the result carries none
// either.
func (r rawValue) trimRight(cutset string) rawValue {
	trimmed := rawValue{data: strings.TrimRight(r.data, cutset)}
	if r.codepointIndices != nil {
		// Only the tail shrank, so surviving indices keep their offsets.
		dropped := len(r.data) - len(trimmed.data)
		trimmed.codepointIndices = r.trimCodepointIndices(0, dropped)
	}
	return trimmed
}

// trim returns a copy of r whose data has every leading and trailing
// character found in cutset removed. If r carries codepoint indices they are
// recomputed for the shortened data; if it carries none (nil), the result
// carries none either.
func (r rawValue) trim(cutset string) rawValue {
	// Trim in two passes so the byte counts removed from each side are known
	// separately; the index adjustment needs both.
	afterLeft := strings.TrimLeft(r.data, cutset)
	afterBoth := strings.TrimRight(afterLeft, cutset)

	out := rawValue{data: afterBoth}
	if r.codepointIndices != nil {
		fromLeft := len(r.data) - len(afterLeft)
		fromRight := len(afterLeft) - len(afterBoth)
		out.codepointIndices = r.trimCodepointIndices(fromLeft, fromRight)
	}
	return out
}

// trimCodepointIndices rebuilds r.codepointIndices for a copy of r.data that
// has lost leftRemovedBytes bytes from its front and rightRemovedBytes bytes
// from its end. Indices that fall inside either removed region are discarded
// and the survivors are shifted down by leftRemovedBytes. The returned slice
// is always non-nil, even when no index survives.
func (r rawValue) trimCodepointIndices(leftRemovedBytes int, rightRemovedBytes int) []int {
	// First byte offset (in the untrimmed data) that belongs to the removed tail.
	end := len(r.data) - rightRemovedBytes

	kept := make([]int, 0, len(r.codepointIndices))
	for _, offset := range r.codepointIndices {
		if offset < leftRemovedBytes || offset >= end {
			continue
		}
		kept = append(kept, offset-leftRemovedBytes)
	}
	return kept
}

func newRawValue(data string, useCodepointIndices bool) (rawValue, error) {
value := rawValue{
data: data,
Expand Down
38 changes: 22 additions & 16 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"io"
"reflect"
"strconv"
"strings"
)

var (
Expand Down Expand Up @@ -197,20 +196,20 @@ func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
}

func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawValue {
var trimFunc func(string) string
var trimFunc func(r rawValue) rawValue

switch format.alignment {
case left:
trimFunc = func(s string) string {
return strings.TrimRight(s, string(format.padChar))
case left: // Aligned left, so trim from right side.
trimFunc = func(r rawValue) rawValue {
return r.trimRight(string(format.padChar))
}
case right:
trimFunc = func(s string) string {
return strings.TrimLeft(s, string(format.padChar))
case right: // Aligned right, so trim from left side.
trimFunc = func(r rawValue) rawValue {
return r.trimLeft(string(format.padChar))
}
default:
trimFunc = func(s string) string {
return strings.Trim(s, string(format.padChar))
trimFunc = func(r rawValue) rawValue {
return r.trim(string(format.padChar))
}
}

Expand All @@ -227,20 +226,27 @@ func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawVa
relevantIndices = value.codepointIndices[startPos-1 : endPos]
lineData = value.data[relevantIndices[0]:value.codepointIndices[endPos]]
}
return rawValue{
data: trimFunc(lineData),
codepointIndices: relevantIndices,

newIndices := relevantIndices
if relevantIndices[0] > 0 {
// We trimmed data from the front of the string.
// We need to adjust the codepoint indices to reflect this, as they have shifted.
removedFromFront := relevantIndices[0]
newIndices = make([]int, 0, len(relevantIndices))
for _, idx := range relevantIndices {
newIndices = append(newIndices, idx-removedFromFront)
}
}

return trimFunc(rawValue{data: lineData, codepointIndices: newIndices})
} else {
if len(value.data) == 0 || startPos > len(value.data) {
return rawValue{data: ""}
}
if endPos > len(value.data) {
endPos = len(value.data)
}
return rawValue{
data: trimFunc(value.data[startPos-1 : endPos]),
}
return trimFunc(rawValue{data: value.data[startPos-1 : endPos]})
}
}

Expand Down
116 changes: 116 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,122 @@ func TestDecodeSetUseCodepointIndices(t *testing.T) {

}

// TestDecodeSetUseCodepointIndices_Nested decodes a single 21-column record
// into a struct that alternates plain string fields with nested structs, with
// codepoint indices enabled on the decoder. It runs once with ASCII-only
// input and once with input containing the multi-byte character ☃, and
// expects each field to receive exactly its three columns in both cases.
func TestDecodeSetUseCodepointIndices_Nested(t *testing.T) {
	type Nested struct {
		First  string `fixed:"1,3"`
		Second string `fixed:"4,6"`
	}

	type Test struct {
		First  string `fixed:"1,3"`
		Second Nested `fixed:"4,9"`
		Third  string `fixed:"10,12"`
		Fourth Nested `fixed:"13,18"`
		Fifth  string `fixed:"19,21"`
	}

	type testCase struct {
		name     string
		raw      []byte
		expected Test
	}

	cases := []testCase{
		{
			name: "All ASCII characters",
			raw:  []byte("123ABC456DEF789GHI012\n"),
			expected: Test{
				First:  "123",
				Second: Nested{First: "ABC", Second: "456"},
				Third:  "DEF",
				Fourth: Nested{First: "789", Second: "GHI"},
				Fifth:  "012",
			},
		},
		{
			name: "Multi-byte characters",
			raw:  []byte("123x☃x456x☃x789x☃x012\n"),
			expected: Test{
				First:  "123",
				Second: Nested{First: "x☃x", Second: "456"},
				Third:  "x☃x",
				Fourth: Nested{First: "789", Second: "x☃x"},
				Fifth:  "012",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var got Test

			dec := NewDecoder(bytes.NewReader(tc.raw))
			dec.SetUseCodepointIndices(true)

			if err := dec.Decode(&got); err != nil {
				t.Errorf("Unexpected err: %v", err)
			}
			if !reflect.DeepEqual(tc.expected, got) {
				t.Errorf("Decode(%v) want %v, have %v", tc.raw, tc.expected, got)
			}
		})
	}
}

// TestDecodeSetUseCodepointIndices_PaddingTrimmed checks that, with codepoint
// indices enabled, decoding still lines fields up correctly after padding has
// been trimmed from a nested value.
//
// The record is 10 columns wide: columns 1-8 hold the nested struct and
// columns 9-10 hold Second. Nested.First ("00", right-aligned with pad char
// '0') trims down to nothing, and columns 3-8 are all spaces, so every nested
// string field must decode as empty while Second still receives the last two
// columns.
func TestDecodeSetUseCodepointIndices_PaddingTrimmed(t *testing.T) {
	type Nested struct {
		First  int64  `fixed:"1,2,right,0"`
		Second string `fixed:"3,4"`
		Third  string `fixed:"5,6"`
		Fourth string `fixed:"7,8"`
	}
	type Test struct {
		First  Nested `fixed:"1,8"`
		Second string `fixed:"9,10"`
	}

	for _, tt := range []struct {
		name     string
		raw      []byte
		expected Test
	}{
		{
			name: "All ASCII characters",
			// "00" + six pad spaces (columns 3-8) + "11" (columns 9-10).
			raw: []byte("00      11"),
			expected: Test{
				First: Nested{
					First:  0,
					Second: "",
					Third:  "",
					Fourth: "",
				},
				Second: "11",
			},
		},
		{
			name: "Multi-byte characters",
			// Same layout, with two multi-byte codepoints in columns 9-10.
			raw: []byte("00      ☃☃"),
			expected: Test{
				First: Nested{
					First:  0,
					Second: "",
					Third:  "",
					Fourth: "",
				},
				Second: "☃☃",
			},
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			d := NewDecoder(bytes.NewReader(tt.raw))
			d.SetUseCodepointIndices(true)
			var s Test
			err := d.Decode(&s)
			if err != nil {
				t.Errorf("Unexpected err: %v", err)
			}
			if !reflect.DeepEqual(tt.expected, s) {
				t.Errorf("Decode(%v) want %v, have %v", tt.raw, tt.expected, s)
			}
		})
	}
}

// Verify the behavior of Decoder.Decode at the end of a file. See
// https://github.com/ianlopshire/go-fixedwidth/issues/6 for more details.
func TestDecode_EOF(t *testing.T) {
Expand Down

0 comments on commit 66e7247

Please sign in to comment.