Skip to content

Commit

Permalink
Made BS.hGetLine's behavior like hGetLine.
Browse files Browse the repository at this point in the history
This closes issue haskell#13.

The changes can be summarized as
updating `findEOL` to look for "\r\n" in CRLF mode
and updating the logic of `haveBuf` to resize the buffer
according to the size of the newline.

Additionally, tests were added to verify that both
`hGetLine`s produce the same behavior.

Some of the edge-cases to worry about here include

* '\n' still counts as a line end.

    Thus line endings' length vary between 1 and 2 in CRLF mode.
* "\r\r\n" can give a false-start.

    This means you can't always skip 2 characters when `c' /= '\n'`.
* '\r' when not followed by '\n' isn't part of a newline.
* Not reading out of the buffer when '\r' is the last character.
  • Loading branch information
dbramucci committed Oct 28, 2020
1 parent 82ea134 commit 5b169ec
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 10 deletions.
37 changes: 27 additions & 10 deletions Data/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1816,31 +1816,48 @@ hGetLine h =
else ioe_EOF
else haveBuf h_ buf' len xss

haveBuf h_@Handle__{haByteBuffer}
haveBuf h_@Handle__{haByteBuffer, haInputNL}
buf@Buffer{ bufRaw=raw, bufR=w, bufL=r }
len xss =
do
off <- findEOL r w raw
(off, sizeNewline) <- findEOL haInputNL r w raw
let new_len = len + off - r
xs <- mkPS raw r off

-- if eol == True, then off is the offset of the '\n'
-- otherwise off == w and the buffer is now empty.
if off /= w
then do if w == off + 1
then writeIORef haByteBuffer buf{ bufL=0, bufR=0 }
else writeIORef haByteBuffer buf{ bufL = off + 1 }
mkBigPS new_len (xs:xss)
then do
-- If off + sizeNewline == w then the remaining buffer is empty
if (off + sizeNewline) == w
then writeIORef haByteBuffer buf{ bufL=0, bufR=0 }
else writeIORef haByteBuffer buf{ bufL = off + sizeNewline }
mkBigPS new_len (xs:xss)
else fill h_ buf{ bufL=0, bufR=0 } new_len (xs:xss)

-- find the end-of-line character, if there is one
findEOL r w raw
| r == w = return w
findEOL haInputNL r w raw
| r == w = return (w, 0)
| otherwise = do
c <- readWord8Buf raw r
if c == fromIntegral (ord '\n')
then return r -- NB. not r+1: don't include the '\n'
else findEOL (r+1) w raw
then do
-- NB. not r+1: don't include the '\n'
-- Also, it is important that it ends the line in both modes
-- To match System.IO.hGetLine's behavior
return (r, 1)
else if haInputNL == CRLF && c == fromIntegral (ord '\r') && r+1 < w
then do
c' <- readWord8Buf raw (r+1)
if c' == fromIntegral (ord '\n')
then return (r, 2) -- NB. not r+1 or r+2: don't include the '\r\n'
else do
-- We cannot jump 2 characters ahead
-- because if we encountered '\r\r\n'
-- We would miss the pattern starting on the second '\r'
findEOL haInputNL (r+1) w raw
else findEOL haInputNL (r+1) w raw


mkPS :: RawBuffer Word8 -> Int -> Int -> IO ByteString
mkPS buf start end =
Expand Down
105 changes: 105 additions & 0 deletions tests/HGetLine.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
import qualified Data.ByteString.Char8 as S8


import Test.HUnit (assertEqual, assertBool)
import qualified Test.Framework as F
import qualified Test.Framework.Providers.HUnit as F

import Control.Monad
import System.IO


testfile_eof :: FilePath
testfile_eof = "line-endings_eof.txt"

testfile_lf_eof :: FilePath
testfile_lf_eof = "line-endings_lf_eof.txt"

testfile_cr_eof :: FilePath
testfile_cr_eof = "line-endings_cr_eof.txt"

testfile_crlf_eof :: FilePath
testfile_crlf_eof = "line-endings_crlf_eof.txt"


testString :: String
testString = concat [
"This file\r\n"
, "tests how hGetLine\r\r\n"
, " handles\n"
, " newlines.\r\n"
, "It is intentionally\n\r"
, "inconsistent\r\n\r\n \r\n"
, "in how it\r"
, "handles newlines.\r\n\n"
, "If it was in the git repo\n\n"
, "it would need to be marked in\r\r"
, "in .gitattributes as a binary file\n"
, "to stop\n\r\r\n"
, "git from changing its endings"
]

writeTestFiles :: IO ()
writeTestFiles = do
writeFile testfile_eof $ testString
writeFile testfile_lf_eof $ testString <> "\n"
writeFile testfile_cr_eof $ testString <> "\r"
writeFile testfile_crlf_eof $ testString <> "\r\n"


readByLinesBS :: Handle -> IO [S8.ByteString]
readByLinesBS h_ = go []
where
go lines = do
isEnd <- hIsEOF h_
if isEnd
then return $ reverse lines
else do
!nextLine <- S8.hGetLine h_
go (nextLine : lines)

readByLinesS :: Handle -> IO [String]
readByLinesS h_ = go []
where
go lines = do
isEnd <- hIsEOF h_
if isEnd
then return $ reverse lines
else do
!nextLine <- hGetLine h_
go (nextLine : lines)

hgetline_like_s8_hgetline :: IO ()
hgetline_like_s8_hgetline =
mapM_ (uncurry hgetline_like_s8_hgetline') $ do
file <- [testfile_eof, testfile_lf_eof, testfile_cr_eof, testfile_crlf_eof]
linemode <- [NewlineMode LF LF, NewlineMode CRLF CRLF, NewlineMode LF CRLF, NewlineMode CRLF LF]
return (file, linemode)


hgetline_like_s8_hgetline' :: FilePath -> NewlineMode -> IO ()
hgetline_like_s8_hgetline' file newlineMode = do
bsLines <- withFile file ReadMode (\h -> do
hSetNewlineMode h newlineMode
readByLinesBS h
)
sLines <- withFile file ReadMode (\h -> do
hSetNewlineMode h newlineMode
readByLinesS h
)
assertEqual ("unpacking S8.hGetLines should equal hGetLines for newlineMode " <> show newlineMode <> " for file " <> file)
(map S8.unpack bsLines)
sLines
assertBool "The test file for hGetLine sshould not be empty" $ bsLines /= []


tests :: [F.Test]
tests = [
F.testCase "hgetline_like_s8_hgetline" hgetline_like_s8_hgetline
]

main :: IO ()
main = do
writeTestFiles
F.defaultMain tests
21 changes: 21 additions & 0 deletions tests/bytestring-tests.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,27 @@ test-suite lazy-hclose
-fno-enable-rewrite-rules
-threaded -rtsopts

test-suite hgetline
type: exitcode-stdio-1.0
main-is: HGetLine.hs
other-modules: Data.ByteString
Data.ByteString.Internal
Data.ByteString.Unsafe
hs-source-dirs: . ..
build-depends: base, ghc-prim, deepseq, random, directory,
test-framework, test-framework-hunit, HUnit
c-sources: ../cbits/fpstring.c
include-dirs: ../include
cpp-options: -DHAVE_TEST_FRAMEWORK=1
ghc-options: -fwarn-unused-binds
-fno-enable-rewrite-rules
-threaded -rtsopts
extensions: BangPatterns
UnliftedFFITypes,
MagicHash,
ScopedTypeVariables
NamedFieldPuns

executable regressions
main-is: Regressions.hs
other-modules: Data.ByteString
Expand Down

0 comments on commit 5b169ec

Please sign in to comment.