Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More informative consistency checks #283

Merged
merged 17 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 118 additions & 52 deletions src/Poseidon/Janno.hs
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,23 @@
decodingOptions,
explicitNA,
removeUselessSuffix,
parseCsvParseError,
renderCsvParseError,
getMaybeJannoList
) where

import Poseidon.Utils (PoseidonException (..),
PoseidonIO, logDebug,
logError,
logError, logWarning,
renderPoseidonException)


import Control.Applicative (empty)
import Control.Exception (throwIO)
import Control.Monad (unless, when)
import qualified Control.Monad.Except as E
nevrome marked this conversation as resolved.
Show resolved Hide resolved
import Control.Monad.IO.Class (liftIO)
import qualified Control.Monad.Writer as W
import Country (Country, alphaTwoUpper,
decodeAlphaTwo)
import Data.Aeson (FromJSON, Options (..),
Expand All @@ -66,14 +70,14 @@
import Data.Bifunctor (second)
import qualified Data.ByteString.Char8 as Bchs
import qualified Data.ByteString.Lazy.Char8 as Bch
import Data.Char (isSpace, ord)
import Data.Char (chr, isSpace, ord)
import qualified Data.Csv as Csv
import Data.Either (lefts, rights)
import qualified Data.HashMap.Strict as HM
import Data.List (elemIndex, foldl',
intercalate, nub, sort,
(\\))
import Data.Maybe (fromJust, isNothing)
import Data.Maybe (fromJust)
import Data.Text (pack, replace, unpack)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
Expand All @@ -84,6 +88,8 @@
import Options.Applicative.Help.Levenshtein (editDistance)
import SequenceFormats.Eigenstrat (EigenstratIndEntry (..),
Sex (..))
import qualified Text.Parsec as P
import qualified Text.Parsec.String as P
import qualified Text.Regex.TDFA as Reg

-- | A datatype for genetic sex
Expand Down Expand Up @@ -996,15 +1002,27 @@
-- | A function to load one row of a janno file
readJannoFileRow :: FilePath -> (Int, Bch.ByteString) -> PoseidonIO (Either PoseidonException JannoRow)
readJannoFileRow jannoPath (lineNumber, row) = do
case Csv.decodeByNameWith decodingOptions row of
let decoded = Csv.decodeByNameWith decodingOptions row
simplifiedDecoded = (\(_,rs) -> V.head rs) <$> decoded
case simplifiedDecoded of
Left e -> do
return $ Left $ PoseidonFileRowException jannoPath lineNumber $ removeUselessSuffix e
Right (_, jannoRow :: V.Vector JannoRow) -> do
case checkJannoRowConsistency jannoPath lineNumber $ V.head jannoRow of
Left e -> do
return $ Left e
Right (pS :: JannoRow) -> do
return $ Right pS
let betterError = case P.parse parseCsvParseError "" e of
Left _ -> removeUselessSuffix e
Right result -> renderCsvParseError result

Check warning on line 1011 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1009-L1011

Added lines #L1009 - L1011 were not covered by tests
return $ Left $ PoseidonFileRowException jannoPath (show lineNumber) betterError
Right jannoRow -> do
let (errOrJannoRow, warnings) = W.runWriter (E.runExceptT (checkJannoRowConsistency jannoRow))
mapM_ (logWarning . renderWarning) warnings
case errOrJannoRow of
Left e -> return $ Left $ PoseidonFileRowException jannoPath renderLocation e
Right r -> return $ Right r
where
renderWarning :: String -> String
renderWarning e = "Issue in " ++ jannoPath ++ " " ++
"in line " ++ renderLocation ++ ": " ++ e

Check warning on line 1022 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1021-L1022

Added lines #L1021 - L1022 were not covered by tests
renderLocation :: String
renderLocation = show lineNumber ++
" (Poseidon_ID: " ++ jPoseidonID jannoRow ++ ")"

Check warning on line 1025 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1024-L1025

Added lines #L1024 - L1025 were not covered by tests

decodingOptions :: Csv.DecodeOptions
decodingOptions = Csv.defaultDecodeOptions {
Expand All @@ -1014,6 +1032,33 @@
removeUselessSuffix :: String -> String
removeUselessSuffix = unpack . replace " at \"\"" "" . pack

-- reformat the parser error
-- from: parse error (Failed reading: conversion error: expected Int, got "430;" (incomplete field parse, leftover: [59]))
-- to: parse error in one column (expected data type: Int, broken value: "430;", problematic characters: ";")

data CsvParseError = CsvParseError {
_expected :: String
, _actual :: String
, _leftover :: String
} deriving Show

Check warning on line 1043 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1040-L1043

Added lines #L1040 - L1043 were not covered by tests

parseCsvParseError :: P.Parser CsvParseError
parseCsvParseError = do
_ <- P.string "parse error (Failed reading: conversion error: expected "
expected <- P.manyTill P.anyChar (P.try (P.string ", got "))
actual <- P.manyTill P.anyChar (P.try (P.string " (incomplete field parse, leftover: ["))
leftoverList <- P.sepBy (read <$> P.many1 P.digit) (P.char ',')
_ <- P.char ']'
_ <- P.many P.anyChar
return $ CsvParseError expected actual (map chr leftoverList)

Check warning on line 1053 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1046-L1053

Added lines #L1046 - L1053 were not covered by tests

renderCsvParseError :: CsvParseError -> String
renderCsvParseError (CsvParseError expected actual leftover) =
"parse error in one column (" ++
"expected data type: " ++ expected ++ ", " ++
"broken value: " ++ actual ++ ", " ++
"problematic characters: " ++ show leftover ++ ")"

Check warning on line 1060 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1056-L1060

Added lines #L1056 - L1060 were not covered by tests

-- | A helper functions to replace empty bytestrings values in janno files with explicit "n/a"
explicitNA :: Bch.ByteString -> Bch.ByteString
explicitNA = replaceInJannoBytestring Bch.empty "n/a"
Expand All @@ -1026,7 +1071,7 @@
tsvRowsUpdated = map (Bch.intercalate (Bch.pack "\t")) tsvCellsUpdated
in Bch.unlines tsvRowsUpdated

-- Janno consistency checks
-- Global janno consistency checks

checkJannoConsistency :: FilePath -> JannoRows -> Either PoseidonException JannoRows
checkJannoConsistency jannoPath xs
Expand All @@ -1037,23 +1082,27 @@
checkIndividualUnique :: JannoRows -> Bool
checkIndividualUnique (JannoRows rows) = length rows == length (nub $ map jPoseidonID rows)

checkJannoRowConsistency :: FilePath -> Int -> JannoRow -> Either PoseidonException JannoRow
checkJannoRowConsistency jannoPath row x
| not $ checkMandatoryStringNotEmpty x = Left $ PoseidonFileRowException jannoPath row
"The mandatory columns Poseidon_ID and Group_Name contain empty values"
| not $ checkC14ColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Date_* columns are not consistent"
| not $ checkContamColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Contamination_* columns are not consistent"
| not $ checkRelationColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Relation_* columns are not consistent"
| otherwise = Right x

checkMandatoryStringNotEmpty :: JannoRow -> Bool
-- Row-wise janno consistency checks

type JannoRowWarnings = [String]
type JannoRowLog = E.ExceptT String (W.Writer JannoRowWarnings)
nevrome marked this conversation as resolved.
Show resolved Hide resolved

checkJannoRowConsistency :: JannoRow -> JannoRowLog JannoRow
checkJannoRowConsistency x =
return x
>>= checkMandatoryStringNotEmpty
>>= checkC14ColsConsistent
>>= checkContamColsConsistent
>>= checkRelationColsConsistent

checkMandatoryStringNotEmpty :: JannoRow -> JannoRowLog JannoRow
checkMandatoryStringNotEmpty x =
(not . null . jPoseidonID $ x)
&& (not . null . getJannoList . jGroupName $ x)
&& (not . null . head . getJannoList . jGroupName $ x)
let notEmpty = (not . null . jPoseidonID $ x) &&
(not . null . getJannoList . jGroupName $ x) &&
(not . null . head . getJannoList . jGroupName $ x)
in case notEmpty of
False -> E.throwError "Poseidon_ID or Group_Name are empty"

Check warning on line 1104 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1104

Added line #L1104 was not covered by tests
True -> return x

getCellLength :: Maybe (JannoList a) -> Int
getCellLength = maybe 0 (length . getJannoList)
Expand All @@ -1062,33 +1111,50 @@
allEqual [] = True
allEqual x = length (nub x) == 1

checkC14ColsConsistent :: JannoRow -> Bool
checkC14ColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkC14ColsConsistent x =
let lLabnr = getCellLength $ jDateC14Labnr x
lUncalBP = getCellLength $ jDateC14UncalBP x
lUncalBPErr = getCellLength $ jDateC14UncalBPErr x
shouldBeTypeC14 = lUncalBP > 0
isTypeC14 = jDateType x == Just C14
in
(lLabnr == 0 || lUncalBP == 0 || lLabnr == lUncalBP)
&& (lLabnr == 0 || lUncalBPErr == 0 || lLabnr == lUncalBPErr)
&& lUncalBP == lUncalBPErr
&& (not shouldBeTypeC14 || isTypeC14)

checkContamColsConsistent :: JannoRow -> Bool
let isTypeC14 = jDateType x == Just C14
lLabnr = getCellLength $ jDateC14Labnr x
lUncalBP = getCellLength $ jDateC14UncalBP x
lUncalBPErr = getCellLength $ jDateC14UncalBPErr x
anyMainColFilled = lUncalBP > 0 || lUncalBPErr > 0
anyMainColEmpty = lUncalBP == 0 || lUncalBPErr == 0
allSameLength = allEqual [lLabnr, lUncalBP, lUncalBPErr] ||
(lLabnr == 0 && lUncalBP == lUncalBPErr)

Check warning on line 1123 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1123

Added line #L1123 was not covered by tests
in case (isTypeC14, anyMainColFilled, anyMainColEmpty, allSameLength) of
(False, False, _, _ ) -> return x
(False, True, _, _ ) -> E.throwError "Date_Type is not \"C14\", but either \
\Date_C14_Uncal_BP or Date_C14_Uncal_BP_Err are not empty"
(True, _, False, False) -> E.throwError "Date_C14_Labnr, Date_C14_Uncal_BP and Date_C14_Uncal_BP_Err \

Check warning on line 1128 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1128

Added line #L1128 was not covered by tests
\do not have the same lengths. Date_C14_Labnr can be empty"
nevrome marked this conversation as resolved.
Show resolved Hide resolved
(True, _, False, True ) -> return x
-- this should be an error, but we have legacy packages with this issue, so it's only a warning
(True, _, True, _ ) -> do
W.tell ["Date_Type is \"C14\", but either Date_C14_Uncal_BP or Date_C14_Uncal_BP_Err are empty"]
return x

Check warning on line 1134 in src/Poseidon/Janno.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Janno.hs#L1132-L1134

Added lines #L1132 - L1134 were not covered by tests

checkContamColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkContamColsConsistent x =
let lContamination = getCellLength $ jContamination x
lContaminationErr = getCellLength $ jContaminationErr x
lContaminationMeas = getCellLength $ jContaminationMeas x
in allEqual [lContamination, lContaminationErr, lContaminationMeas]

checkRelationColsConsistent :: JannoRow -> Bool
let lContamination = getCellLength $ jContamination x
lContaminationErr = getCellLength $ jContaminationErr x
lContaminationMeas = getCellLength $ jContaminationMeas x
allSameLength = allEqual [lContamination, lContaminationErr, lContaminationMeas]
in case allSameLength of
False -> E.throwError "Contamination, Contamination_Err and Contamination_Meas \
\do not have the same lengths"
True -> return x

checkRelationColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkRelationColsConsistent x =
let lRelationTo = getCellLength $ jRelationTo x
lRelationDegree = getCellLength $ jRelationDegree x
lRelationType = getCellLength $ jRelationType x
in allEqual [lRelationTo, lRelationType, lRelationDegree]
|| (allEqual [lRelationTo, lRelationDegree] && isNothing (jRelationType x))
let lRelationTo = getCellLength $ jRelationTo x
lRelationDegree = getCellLength $ jRelationDegree x
lRelationType = getCellLength $ jRelationType x
allSameLength = allEqual [lRelationTo, lRelationDegree, lRelationType] ||
(allEqual [lRelationTo, lRelationDegree] && lRelationType == 0)
in case allSameLength of
False -> E.throwError "Relation_To, Relation_Degree and Relation_Type \
\do not have the same lengths. Relation_Type can be empty"
True -> return x

-- deriving with TemplateHaskell necessary for the generics magic in the Survey module
deriveGeneric ''JannoRow
44 changes: 18 additions & 26 deletions src/Poseidon/SequencingSource.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
explicitNA, filterLookup,
filterLookupOptional, getCsvNR,
makeAccessionID,
removeUselessSuffix)
parseCsvParseError,
removeUselessSuffix,
renderCsvParseError)
import Poseidon.Utils (PoseidonException (..), PoseidonIO,
logDebug, logError, logWarning,
renderPoseidonException)
Expand Down Expand Up @@ -39,6 +41,7 @@
import qualified Data.Vector as V
import Data.Yaml.Aeson (FromJSON (..))
import GHC.Generics (Generic)
import qualified Text.Parsec as P

-- |A datatype to represent UDG in a ssf file
data SSFUDG =
Expand Down Expand Up @@ -400,36 +403,25 @@
mapM_ (logError . renderPoseidonException) $ take 5 $ lefts seqSourceRepresentation
liftIO $ throwIO $ PoseidonFileConsistencyException seqSourcePath "Broken lines."
else do
let consistentSeqSource = checkSeqSourceConsistency seqSourcePath $ SeqSourceRows $ rights seqSourceRepresentation
case consistentSeqSource of
Left e -> do liftIO $ throwIO (e :: PoseidonException)
Right x -> do
-- warnings
warnSeqSourceConsistency seqSourcePath x
-- finally return the good ones
return x
let seqSource = SeqSourceRows $ rights seqSourceRepresentation
warnSeqSourceConsistency seqSourcePath seqSource
return seqSource

-- | A function to read one row of a seqSourceFile
readSeqSourceFileRow :: FilePath -> (Int, Bch.ByteString) -> PoseidonIO (Either PoseidonException SeqSourceRow)
readSeqSourceFileRow seqSourcePath (lineNumber, row) = do
case Csv.decodeByNameWith decodingOptions row of
let decoded = Csv.decodeByNameWith decodingOptions row
simplifiedDecoded = (\(_,rs) -> V.head rs) <$> decoded
case simplifiedDecoded of
Left e -> do
return $ Left $ PoseidonFileRowException seqSourcePath lineNumber $ removeUselessSuffix e
Right (_, seqSourceRow :: V.Vector SeqSourceRow) -> do
case checkSeqSourceRowConsistency seqSourcePath lineNumber $ V.head seqSourceRow of
Left e -> do return $ Left e
Right (pS :: SeqSourceRow) -> do return $ Right pS

-- SeqSource consistency checks
checkSeqSourceConsistency :: FilePath -> SeqSourceRows -> Either PoseidonException SeqSourceRows
checkSeqSourceConsistency _ xs
-- | ... no tests implemented
| otherwise = Right xs

checkSeqSourceRowConsistency :: FilePath -> Int -> SeqSourceRow -> Either PoseidonException SeqSourceRow
checkSeqSourceRowConsistency _ _ x
-- | ... no tests implemented
| otherwise = Right x
let betterError = case P.parse parseCsvParseError "" e of
Left _ -> removeUselessSuffix e
Right result -> renderCsvParseError result
return $ Left $ PoseidonFileRowException seqSourcePath (show lineNumber) betterError

Check warning on line 420 in src/Poseidon/SequencingSource.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/SequencingSource.hs#L417-L420

Added lines #L417 - L420 were not covered by tests
Right seqSourceRow -> do
return $ Right seqSourceRow

-- Global SSF consistency checks

warnSeqSourceConsistency :: FilePath -> SeqSourceRows -> PoseidonIO ()
warnSeqSourceConsistency seqSourcePath xs = do
Expand Down
4 changes: 2 additions & 2 deletions src/Poseidon/Utils.hs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@
| PoseidonGenotypeException String -- ^ An exception to represent errors in the genotype data
| PoseidonGenotypeExceptionForward SomeException -- ^ An exception to represent errors in the genotype data forwarded from the sequence-formats library
| PoseidonHttpExceptionForward HttpException -- ^ An exception to represent errors in the remote data loading forwarded from simpleHttp
| PoseidonFileRowException FilePath Int String -- ^ An exception to represent errors when trying to parse the janno or seqSource file
| PoseidonFileRowException FilePath String String -- ^ An exception to represent errors when trying to parse the janno or seqSource file
| PoseidonFileConsistencyException FilePath String -- ^ An exception to represent consistency errors in janno or seqSource files
| PoseidonCrossFileConsistencyException String String -- ^ An exception to represent inconsistencies across multiple files in a package
| PoseidonCollectionException String -- ^ An exception to represent logical issues in a poseidon package Collection
Expand Down Expand Up @@ -204,7 +204,7 @@
renderPoseidonException (PoseidonHttpExceptionForward e) =
"Issues in HTTP-communication with server: " ++ show e
renderPoseidonException (PoseidonFileRowException f i s) =
"Can't read sample in " ++ f ++ " in line " ++ show i ++ ": " ++ s
"Can't read sample in " ++ f ++ " in line " ++ i ++ ": " ++ s

Check warning on line 207 in src/Poseidon/Utils.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Utils.hs#L207

Added line #L207 was not covered by tests
renderPoseidonException (PoseidonFileConsistencyException f s) =
"Consistency issues in file " ++ f ++ ": " ++ s
renderPoseidonException (PoseidonCrossFileConsistencyException p s) =
Expand Down
Loading