Skip to content

Commit

Permalink
continued
Browse files Browse the repository at this point in the history
  • Loading branch information
stschiff committed Dec 18, 2023
1 parent 6ee7de7 commit 7a8c331
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 5 deletions.
3 changes: 3 additions & 0 deletions src-executables/Main-trident.hs
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,6 @@ jannocoalesceOptParser = JannoCoalesceOptions <$> parseJannocoalSourceSpec
<*> parseJannocoalOutSpec
<*> parseJannocoalFillColumns
<*> parseJannocoalOverride
<*> parseJannocoalSourceKey
<*> parseJannocoalTargetKey
<*> parseJannocoalIdStripRegex
33 changes: 29 additions & 4 deletions src/Poseidon/CLI/Jannocoalesce.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@ import Poseidon.Package (PackageReadOptions (..),
import Poseidon.Utils (PoseidonException (..), PoseidonIO,
logInfo)

import Control.Monad (forM)
import Control.Monad (filterM, forM)
import Control.Monad.Catch (MonadThrow, throwM)
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as BSC
import qualified Data.Csv as Csv
import qualified Data.HashMap.Strict as HM
import Data.List.Utils (replace)

Check failure on line 19 in src/Poseidon/CLI/Jannocoalesce.hs

View workflow job for this annotation

GitHub Actions / stack / ghc 9.4.7

Could not find module ‘Data.List.Utils’
import System.Directory (createDirectoryIfMissing)
import System.FilePath (takeDirectory)
import Text.Regex.TDFA ((=~))

Check failure on line 22 in src/Poseidon/CLI/Jannocoalesce.hs

View workflow job for this annotation

GitHub Actions / stack / ghc 9.4.7

Module ‘Text.Regex.TDFA’ does not export ‘subRegex’

-- the source can be a single janno file, or a set of base directories as usual.
data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath]
Expand All @@ -28,10 +30,13 @@ data JannoCoalesceOptions = JannoCoalesceOptions
, _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place"
, _jannocoalesceFillColumns :: [String] -- empty list means All
, _jannocoalesceOverwriteColumns :: Bool
, _jannocoalesceSourceKey :: String -- by default set to "Poseidon_ID"
, _jannocoalesceTargetKey :: String -- by default set to "Poseidon_ID"
, _jannocoalesceIdStrip :: Maybe String -- an optional regex to strip from target and source keys
}

runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO ()
runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite) = do
runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite sKey tKey maybeStrip) = do
JannoRows sourceRows <- case sourceSpec of
JannoSourceSingle sourceFile -> readJannoFile sourceFile
JannoSourceBaseDirs sourceDirs -> do
Expand All @@ -44,8 +49,8 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit
getJointJanno <$> readPoseidonPackageCollection pacReadOpts sourceDirs
JannoRows targetRows <- readJannoFile target
newJanno <- forM targetRows $ \targetRow -> do
let posId = jPoseidonID targetRow
sourceRowCandidates = filter (\r -> jPoseidonID r == posId) sourceRows
posId <- getKeyFromJanno targetRow tKey
sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows
case sourceRowCandidates of
[] -> return targetRow
[keyRow] -> mergeRow targetRow keyRow fields overwrite
Expand All @@ -56,6 +61,26 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit
createDirectoryIfMissing True (takeDirectory outPath)
writeJannoFile outPath (JannoRows newJanno)

-- | Fetch the value stored under the column named @key@ in a janno row.
--
-- The row is rendered as a 'Csv.NamedRecord' and the column is looked up by
-- its name. If the record has no such column, a 'PoseidonGenericException'
-- is thrown in the surrounding 'MonadThrow' context.
getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String
getKeyFromJanno jannoRow key =
    maybe missingKey (return . BSC.unpack) (HM.lookup (BSC.pack key) namedRecord)
  where
    namedRecord = Csv.toNamedRecord jannoRow
    missingKey  = throwM $ PoseidonGenericException ("Key " ++ key ++ " not present in janno file")

-- | Compare two IDs for equality, optionally after cutting a regex match out
-- of each of them.
--
-- With 'Nothing' the two IDs are compared verbatim. With @Just regex@ the
-- first substring matching @regex@ is removed from both IDs before they are
-- compared; an ID in which the regex does not match is used unchanged.
matchWithOptionalStrip :: (Maybe String) -> String -> String -> Bool
matchWithOptionalStrip maybeRegex id1 id2 =
    case maybeRegex of
        Nothing -> id1 == id2
        Just r  -> stripR r id1 == stripR r id2
  where
    -- Asking (=~) for a String triple yields (text before the match, the
    -- match itself, text after the match). When nothing matches, the whole
    -- input is returned in the first component and the other two are empty,
    -- so gluing the outer parts together leaves the input untouched.
    stripR :: String -> String -> String
    stripR r s =
        let (before, _, after) = s =~ r :: (String, String, String)
        in  before ++ after

mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow
mergeRow targetRow sourceRow fields overwrite = do
let targetRowRecord = Csv.toNamedRecord targetRow
Expand Down
11 changes: 11 additions & 0 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -807,3 +807,14 @@ parseJannocoalOverride = OP.switch (
OP.short 'f' <>
OP.help "With this option, potential non-missing content in target columns gets overridden with non-missing content in source columns. By default, only missing data gets filled-in."
)

-- | Command-line option @--sourceKey@: the janno column used as the key on
-- the source side. Falls back to @Poseidon_ID@, and the default is shown in
-- the help text.
parseJannocoalSourceKey :: OP.Parser String
parseJannocoalSourceKey = OP.strOption $ mconcat
    [ OP.long "sourceKey"
    , OP.help "the janno column to use as the source key"
    , OP.value "Poseidon_ID"
    , OP.showDefault
    ]

-- | Command-line option @--targetKey@: the janno column used as the key on
-- the target side. Falls back to @Poseidon_ID@, and the default is shown in
-- the help text.
parseJannocoalTargetKey :: OP.Parser String
parseJannocoalTargetKey = OP.strOption $ mconcat
    [ OP.long "targetKey"
    , OP.help "the janno column to use as the target key"
    , OP.value "Poseidon_ID"
    , OP.showDefault
    ]

-- | Command-line option @--stripIdSuffix@: an optional regular expression.
-- The argument is wrapped in 'Just'; when the option is not given the parser
-- yields 'Nothing'.
parseJannocoalIdStripRegex :: OP.Parser (Maybe String)
parseJannocoalIdStripRegex = OP.option (fmap Just OP.str) modifiers
  where
    modifiers = mconcat
        [ OP.long "stripIdSuffix"
        , OP.help "an optional regular expression to identify parts of the IDs to strip before matching between source and target"
        , OP.value Nothing
        ]

5 changes: 4 additions & 1 deletion test/PoseidonGoldenTests/GoldenTestsRunCommands.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,10 @@ testPipelineJannocoalesce testDir checkFilePath = do
_jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno",
_jannocoalesceOutSpec = Just (testDir </> "jannocoalesce" </> "targetNoFieldsNoOverride.janno"),
_jannocoalesceFillColumns = [],
_jannocoalesceOverwriteColumns = False
_jannocoalesceOverwriteColumns = False,
_jannocoalesceSourceKey = "Poseidon_ID",
_jannocoalesceTargetKey = "Poseidon_ID",
_jannocoalesceIdStripRegex = Nothing
}
runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [
"jannocoalesce" </> "targetNoFieldsNoOverride.janno"
Expand Down

0 comments on commit 7a8c331

Please sign in to comment.