diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs
index 0727acc7..81c33f47 100644
--- a/src-executables/Main-trident.hs
+++ b/src-executables/Main-trident.hs
@@ -275,3 +275,6 @@ jannocoalesceOptParser = JannoCoalesceOptions <$> parseJannocoalSourceSpec
                                             <*> parseJannocoalOutSpec
                                             <*> parseJannocoalFillColumns
                                             <*> parseJannocoalOverride
+                                            <*> parseJannocoalSourceKey
+                                            <*> parseJannocoalTargetKey
+                                            <*> parseJannocoalIdStripRegex
diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs
index 9fcfb7b0..55a454ad 100644
--- a/src/Poseidon/CLI/Jannocoalesce.hs
+++ b/src/Poseidon/CLI/Jannocoalesce.hs
@@ -10,14 +10,16 @@ import Poseidon.Package (PackageReadOptions (..),
 import Poseidon.Utils (PoseidonException (..), PoseidonIO,
                        logInfo)
 
-import Control.Monad (forM)
+import Control.Monad (filterM, forM)
 import Control.Monad.Catch (MonadThrow, throwM)
 import Control.Monad.IO.Class (liftIO)
 import qualified Data.ByteString.Char8 as BSC
 import qualified Data.Csv as Csv
 import qualified Data.HashMap.Strict as HM
+import Data.List.Utils (replace)
 import System.Directory (createDirectoryIfMissing)
 import System.FilePath (takeDirectory)
+import Text.Regex.TDFA ((=~))
 
 -- the source can be a single janno file, or a set of base directories as usual.
 data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath]
@@ -28,10 +30,13 @@ data JannoCoalesceOptions = JannoCoalesceOptions
     , _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place"
     , _jannocoalesceFillColumns :: [String] -- empty list means All
     , _jannocoalesceOverwriteColumns :: Bool
+    , _jannocoalesceSourceKey :: String -- by default set to "Poseidon_ID"
+    , _jannocoalesceTargetKey :: String -- by default set to "Poseidon_ID"
+    , _jannocoalesceIdStrip :: Maybe String -- an optional regex to strip from target and source keys
     }
 
 runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO ()
-runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite) = do
+runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite sKey tKey maybeStrip) = do
     JannoRows sourceRows <- case sourceSpec of
         JannoSourceSingle sourceFile -> readJannoFile sourceFile
         JannoSourceBaseDirs sourceDirs -> do
@@ -44,8 +49,8 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit
             getJointJanno <$> readPoseidonPackageCollection pacReadOpts sourceDirs
     JannoRows targetRows <- readJannoFile target
     newJanno <- forM targetRows $ \targetRow -> do
-        let posId = jPoseidonID targetRow
-            sourceRowCandidates = filter (\r -> jPoseidonID r == posId) sourceRows
+        posId <- getKeyFromJanno targetRow tKey
+        sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows
         case sourceRowCandidates of
             [] -> return targetRow
             [keyRow] -> mergeRow targetRow keyRow fields overwrite
@@ -56,6 +61,34 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit
     createDirectoryIfMissing True (takeDirectory outPath)
     writeJannoFile outPath (JannoRows newJanno)
 
+-- | Look up the value of the column @key@ in a janno row.
+-- Throws a 'PoseidonGenericException' if the row has no column with that name.
+getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String
+getKeyFromJanno jannoRow key = do
+    let jannoRowDict = Csv.toNamedRecord jannoRow
+    case jannoRowDict HM.!? (BSC.pack key) of
+        Nothing -> throwM $ PoseidonGenericException ("Key " ++ key ++ " not present in janno file")
+        Just r -> return $ BSC.unpack r
+
+-- | Compare two IDs for equality, optionally after stripping a regex match.
+-- With 'Nothing' the IDs are compared verbatim. With @Just regex@ the first
+-- substring matching @regex@ is determined for each ID and every occurrence
+-- of that substring is removed before the comparison.
+-- NOTE: '=~' compiles the pattern at match time, so a malformed pattern
+-- surfaces as a runtime error rather than as a 'PoseidonException'.
+matchWithOptionalStrip :: (Maybe String) -> String -> String -> Bool
+matchWithOptionalStrip maybeRegex id1 id2 =
+    case maybeRegex of
+        Nothing -> id1 == id2
+        Just r  -> stripR r id1 == stripR r id2
+  where
+    stripR :: String -> String -> String
+    stripR r s =
+        let matched = s =~ r :: String
+        -- an empty result means "no match" (or an empty match): keep the ID
+        -- unchanged, and never hand 'replace' an empty search string
+        in  if null matched then s else replace matched "" s
+
 mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow
 mergeRow targetRow sourceRow fields overwrite = do
     let targetRowRecord = Csv.toNamedRecord targetRow
diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs
index df1e5823..947621ec 100644
--- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs
+++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs
@@ -807,3 +807,14 @@ parseJannocoalOverride = OP.switch (
     OP.short 'f' <>
     OP.help "With this option, potential non-missing content in target columns gets overridden with non-missing content in source columns. By default, only missing data gets filled-in."
     )
+
+parseJannocoalSourceKey :: OP.Parser String
+parseJannocoalSourceKey = OP.strOption (OP.long "sourceKey" <> OP.help "the janno column to use as the source key" <> OP.value "Poseidon_ID" <> OP.showDefault)
+
+parseJannocoalTargetKey :: OP.Parser String
+parseJannocoalTargetKey = OP.strOption (OP.long "targetKey" <> OP.help "the janno column to use as the target key" <> OP.value "Poseidon_ID" <> OP.showDefault)
+
+parseJannocoalIdStripRegex :: OP.Parser (Maybe String)
+parseJannocoalIdStripRegex = OP.option (Just <$> OP.str) (OP.long "stripIdSuffix" <>
+    OP.help "an optional regular expression to identify parts of the IDs to strip before matching between source and target" <> OP.value Nothing)
+
diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs
index b10657e4..62b161be 100644
--- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs
+++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs
@@ -1091,7 +1091,10 @@ testPipelineJannocoalesce testDir checkFilePath = do
           _jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno",
           _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "targetNoFieldsNoOverride.janno"),
           _jannocoalesceFillColumns = [],
-          _jannocoalesceOverwriteColumns = False
+          _jannocoalesceOverwriteColumns = False,
+          _jannocoalesceSourceKey = "Poseidon_ID",
+          _jannocoalesceTargetKey = "Poseidon_ID",
+          _jannocoalesceIdStrip = Nothing
     }
     runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [
           "jannocoalesce" "targetNoFieldsNoOverride.janno"