diff --git a/Data/ByteString.hs b/Data/ByteString.hs index 2d8b337c5..071b9b0d9 100644 --- a/Data/ByteString.hs +++ b/Data/ByteString.hs @@ -1816,31 +1816,48 @@ hGetLine h = else ioe_EOF else haveBuf h_ buf' len xss - haveBuf h_@Handle__{haByteBuffer} + haveBuf h_@Handle__{haByteBuffer, haInputNL} buf@Buffer{ bufRaw=raw, bufR=w, bufL=r } len xss = do - off <- findEOL r w raw + (off, sizeNewline) <- findEOL haInputNL r w raw let new_len = len + off - r xs <- mkPS raw r off -- if eol == True, then off is the offset of the '\n' -- otherwise off == w and the buffer is now empty. if off /= w - then do if w == off + 1 - then writeIORef haByteBuffer buf{ bufL=0, bufR=0 } - else writeIORef haByteBuffer buf{ bufL = off + 1 } - mkBigPS new_len (xs:xss) + then do + -- If off + sizeNewline == w then the remaining buffer is empty + if (off + sizeNewline) == w + then writeIORef haByteBuffer buf{ bufL=0, bufR=0 } + else writeIORef haByteBuffer buf{ bufL = off + sizeNewline } + mkBigPS new_len (xs:xss) else fill h_ buf{ bufL=0, bufR=0 } new_len (xs:xss) -- find the end-of-line character, if there is one - findEOL r w raw - | r == w = return w + findEOL haInputNL r w raw + | r == w = return (w, 0) | otherwise = do c <- readWord8Buf raw r if c == fromIntegral (ord '\n') - then return r -- NB. not r+1: don't include the '\n' - else findEOL (r+1) w raw + then do + -- NB. not r+1: don't include the '\n' + -- Also, it is important that it ends the line in both modes + -- To match System.IO.hGetLine's behavior + return (r, 1) + else if haInputNL == CRLF && c == fromIntegral (ord '\r') && r+1 < w + then do + c' <- readWord8Buf raw (r+1) + if c' == fromIntegral (ord '\n') + then return (r, 2) -- NB. not r+1 or r+2: don't include the '\r\n' + else do + -- We cannot jump 2 characters ahead + -- because if we encountered '\r\r\n' + -- We would miss the pattern starting on the second '\r' + findEOL haInputNL (r+1) w raw + else findEOL haInputNL (r+1) w raw + mkPS :: RawBuffer Word8 -> Int -> Int -> IO ByteString mkPS buf start end = diff --git a/tests/HGetLine.hs b/tests/HGetLine.hs new file mode 100644 index 000000000..2607165fa --- /dev/null +++ b/tests/HGetLine.hs @@ -0,0 +1,105 @@ +{-# LANGUAGE BangPatterns, ScopedTypeVariables #-} +import qualified Data.ByteString.Char8 as S8 + + +import Test.HUnit (assertEqual, assertBool) +import qualified Test.Framework as F +import qualified Test.Framework.Providers.HUnit as F + +import Control.Monad +import System.IO + + +testfile_eof :: FilePath +testfile_eof = "line-endings_eof.txt" + +testfile_lf_eof :: FilePath +testfile_lf_eof = "line-endings_lf_eof.txt" + +testfile_cr_eof :: FilePath +testfile_cr_eof = "line-endings_cr_eof.txt" + +testfile_crlf_eof :: FilePath +testfile_crlf_eof = "line-endings_crlf_eof.txt" + + +testString :: String +testString = concat [ + "This file\r\n" + , "tests how hGetLine\r\r\n" + , " handles\n" + , " newlines.\r\n" + , "It is intentionally\n\r" + , "inconsistent\r\n\r\n \r\n" + , "in how it\r" + , "handles newlines.\r\n\n" + , "If it was in the git repo\n\n" + , "it would need to be marked in\r\r" + , "in .gitattributes as a binary file\n" + , "to stop\n\r\r\n" + , "git from changing its endings" + ] + +writeTestFiles :: IO () +writeTestFiles = do + writeFile testfile_eof $ testString + writeFile testfile_lf_eof $ testString <> "\n" + writeFile testfile_cr_eof $ testString <> "\r" + writeFile testfile_crlf_eof $ testString <> "\r\n" + + +readByLinesBS :: Handle -> IO [S8.ByteString] +readByLinesBS h_ = go [] + where + go lines = do + isEnd <- hIsEOF h_ + if isEnd + then return $ reverse lines + else do + !nextLine <- S8.hGetLine h_ + go (nextLine : lines) + +readByLinesS :: Handle -> IO [String] +readByLinesS h_ = go [] + where + go lines = do + isEnd <- hIsEOF h_ + if isEnd + then return $ reverse lines + else do + !nextLine <- hGetLine h_ + go (nextLine : lines) + +hgetline_like_s8_hgetline :: IO () +hgetline_like_s8_hgetline = + mapM_ (uncurry hgetline_like_s8_hgetline') $ do + file <- [testfile_eof, testfile_lf_eof, testfile_cr_eof, testfile_crlf_eof] + linemode <- [NewlineMode LF LF, NewlineMode CRLF CRLF, NewlineMode LF CRLF, NewlineMode CRLF LF] + return (file, linemode) + + +hgetline_like_s8_hgetline' :: FilePath -> NewlineMode -> IO () +hgetline_like_s8_hgetline' file newlineMode = do + bsLines <- withFile file ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLinesBS h + ) + sLines <- withFile file ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLinesS h + ) + assertEqual ("unpacking S8.hGetLines should equal hGetLines for newlineMode " <> show newlineMode <> " for file " <> file) + (map S8.unpack bsLines) + sLines + assertBool "The test file for hGetLine sshould not be empty" $ bsLines /= [] + + +tests :: [F.Test] +tests = [ + F.testCase "hgetline_like_s8_hgetline" hgetline_like_s8_hgetline + ] + +main :: IO () +main = do + writeTestFiles + F.defaultMain tests diff --git a/tests/bytestring-tests.cabal b/tests/bytestring-tests.cabal index 6f4dd55d2..b3720fb70 100644 --- a/tests/bytestring-tests.cabal +++ b/tests/bytestring-tests.cabal @@ -71,6 +71,27 @@ test-suite lazy-hclose -fno-enable-rewrite-rules -threaded -rtsopts +test-suite hgetline + type: exitcode-stdio-1.0 + main-is: HGetLine.hs + other-modules: Data.ByteString + Data.ByteString.Internal + Data.ByteString.Unsafe + hs-source-dirs: . .. + build-depends: base, ghc-prim, deepseq, random, directory, + test-framework, test-framework-hunit, HUnit + c-sources: ../cbits/fpstring.c + include-dirs: ../include + cpp-options: -DHAVE_TEST_FRAMEWORK=1 + ghc-options: -fwarn-unused-binds + -fno-enable-rewrite-rules + -threaded -rtsopts + extensions: BangPatterns + UnliftedFFITypes, + MagicHash, + ScopedTypeVariables + NamedFieldPuns + executable regressions main-is: Regressions.hs other-modules: Data.ByteString