Skip to content

Commit

Permalink
Merge pull request #318 from poseidon-framework/gzip-writing-support
Browse files Browse the repository at this point in the history
  • Loading branch information
stschiff authored Dec 19, 2024
2 parents 6e1b382 + 67872ba commit d918c11
Show file tree
Hide file tree
Showing 17 changed files with 267 additions and 120 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- V 1.6.0.0:
- Added support for writing gzipped EIGENSTRAT and PLINK files with `genoconvert` and `forge`. Both commands get a new option `-z`, which creates gzipped output.
- V 1.5.7.4:
- Fixed a bug that broke the long-form genotype data input option (with `--genoFile + --snpFile + ...`).
- V 1.5.7.3:
Expand Down
2 changes: 1 addition & 1 deletion poseidon-hs.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: poseidon-hs
version: 1.5.7.4
version: 1.6.0.0
synopsis: A package with tools for working with Poseidon genotype data
description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals.
license: MIT
Expand Down
3 changes: 2 additions & 1 deletion src-executables/Main-trident.hs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ forgeOptParser = ForgeOptions <$> parseGenoDataSources
<*> parseIntersect
<*> parseOutGenotypeFormat True
<*> parseForgeOutMode
<*> parseZipOut
<*> parseOutPackagePath
<*> parseMaybeOutPackageName
<*> parsePackageWise
Expand All @@ -220,11 +221,11 @@ forgeOptParser = ForgeOptions <$> parseGenoDataSources
-- | Command line parser for 'GenoconvertOptions'.
-- The individual option parsers are applied positionally to the
-- 'GenoconvertOptions' constructor, so their order here has to match the
-- order of the record fields.
-- NOTE(review): this listing contains both parseOutOnlyGenoSwitch and
-- parseZipOut, while the updated runGenoconvert pattern binds seven fields;
-- presumably one of the two lines is the removed side of the diff -- confirm.
genoconvertOptParser :: OP.Parser GenoconvertOptions
genoconvertOptParser = GenoconvertOptions <$> parseGenoDataSources
<*> parseOutGenotypeFormat False
<*> parseOutOnlyGenoSwitch
<*> parseMaybeOutPackagePath
<*> parseRemoveOld
<*> parseOutputPlinkPopMode
<*> parseOnlyLatest
<*> parseZipOut

summariseOptParser :: OP.Parser SummariseOptions
summariseOptParser = SummariseOptions <$> parseBasePaths
Expand Down
14 changes: 8 additions & 6 deletions src/Poseidon/CLI/Forge.hs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data ForgeOptions = ForgeOptions
, _forgeIntersect :: Bool
, _forgeOutFormat :: String
, _forgeOutMode :: ForgeOutMode
, _forgeOutZip :: Bool
, _forgeOutPacPath :: FilePath
, _forgeOutPacName :: Maybe String
, _forgePackageWise :: Bool
Expand Down Expand Up @@ -108,7 +109,7 @@ runForge :: ForgeOptions -> PoseidonIO ()
runForge (
ForgeOptions genoSources
entityInputs maybeSnpFile intersect_
outFormat outMode outPathRaw maybeOutName
outFormat outMode outZip outPathRaw maybeOutName
packageWise outPlinkPopMode
outputOrdered
) = do
Expand Down Expand Up @@ -184,14 +185,15 @@ runForge (
Nothing -> snpSetMergeList snpSetList intersect_
Just _ -> SNPSetOther
-- compile genotype data structure
let gz = if outZip then "gz" else ""
genotypeFileData <- case outFormat of
"EIGENSTRAT" -> return $
GenotypeEigenstrat (outName <.> ".geno") Nothing
(outName <.> ".snp") Nothing
GenotypeEigenstrat (outName <.> ".geno" <.> gz) Nothing
(outName <.> ".snp" <.> gz) Nothing
(outName <.> ".ind") Nothing
"PLINK" -> return $
GenotypePlink (outName <.> ".bed") Nothing
(outName <.> ".bim") Nothing
GenotypePlink (outName <.> ".bed" <.> gz) Nothing
(outName <.> ".bim" <.> gz) Nothing
(outName <.> ".fam") Nothing
_ -> liftIO . throwIO $
PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
Expand Down Expand Up @@ -270,7 +272,7 @@ runForge (
let fullSourcePath = posPacBaseDir pacSource </> path
liftIO $ checkFile fullSourcePath Nothing
liftIO $ copyFile fullSourcePath $ outPath </> path
compileGenotypeData :: FilePath -> GenotypeFileSpec -> [PoseidonPackage] -> [Int] -> PoseidonIO (VUM.IOVector Int)
compileGenotypeData :: FilePath -> GenotypeFileSpec -> [PoseidonPackage] -> [Int] -> PoseidonIO (VUM.IOVector Int)
compileGenotypeData outPath gFileSpec relevantPackages relevantIndices = do
logInfo "Compiling genotype data"
logInfo "Processing SNPs..."
Expand Down
169 changes: 105 additions & 64 deletions src/Poseidon/CLI/Genoconvert.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module Poseidon.CLI.Genoconvert where
import Poseidon.EntityTypes (HasNameAndVersion (..))
import Poseidon.GenotypeData (GenoDataSource (..),
GenotypeDataSpec (..),
GenotypeFileSpec (..), getFormat,
GenotypeFileSpec (..),
loadGenotypeData,
printSNPCopyProgress)
import Poseidon.Janno (jannoRows2EigenstratIndEntries)
Expand All @@ -17,10 +17,11 @@ import Poseidon.Package (PackageReadOptions (..),
writePoseidonPackage)
import Poseidon.Utils (PoseidonException (..), PoseidonIO,
envErrorLength, envLogAction,
logInfo, logWarning)
logError, logInfo, logWarning)

import Control.Exception (catch, throwIO)
import Control.Monad (unless, when)
import Data.List ((\\))
import Data.Maybe (isJust)
import Data.Time (getCurrentTime)
import Pipes (MonadIO (liftIO), runEffect, (>->))
Expand All @@ -29,102 +30,142 @@ import SequenceFormats.Eigenstrat (writeEigenstrat)
import SequenceFormats.Plink (PlinkPopNameMode,
eigenstratInd2PlinkFam, writePlink)
import System.Directory (createDirectoryIfMissing,
doesFileExist, removeFile)
doesFileExist, removeFile,
renameFile)
import System.Exit (ExitCode (..), exitWith)
import System.FilePath (dropTrailingPathSeparator, (<.>),
(</>))

-- | A datatype representing command line options for the genoconvert command
data GenoconvertOptions = GenoconvertOptions
{ _genoconvertGenoSources :: [GenoDataSource]
-- ^ genotype data inputs; runGenoconvert splits them into package base
-- directories (PacBaseDir) and directly specified genotype data (GenoDirect)
, _genoConvertOutFormat :: String
-- ^ name of the output format; convertGenoTo accepts "EIGENSTRAT" or "PLINK"
-- and throws a PoseidonGenericException for anything else
, _genoConvertOutOnlyGeno :: Bool
-- NOTE(review): the updated runGenoconvert pattern no longer binds this
-- field (it passes False/True to convertGenoTo directly); presumably this is
-- the removed side of the diff -- confirm.
, _genoMaybeOutPackagePath :: Maybe FilePath
-- ^ optional output directory; when Nothing, convertGenoTo writes into the
-- package's own base directory
, _genoconvertRemoveOld :: Bool
-- ^ whether the replaced input genotype files are deleted after conversion
, _genoconvertOutPlinkPopMode :: PlinkPopNameMode
-- ^ handed to eigenstratInd2PlinkFam when PLINK output is written
, _genoconvertOnlyLatest :: Bool
-- ^ forwarded to the package read options as _readOptOnlyLatest
, _genoconvertOutZip :: Bool
-- ^ when True, the output genotype and SNP file names get a gz extension;
-- the .ind/.fam file name is left without it
}

-- | Entry point of the genoconvert command: reads the packages and the
-- unpackaged genotype data named in the options and runs 'convertGenoTo'
-- on each of them.
-- NOTE(review): this listing interleaves two versions of several lines
-- (two function heads, two pairs of package-loading bindings, two pairs of
-- convertGenoTo calls); presumably the shorter-argument variants are the
-- removed side of the diff -- confirm.
runGenoconvert :: GenoconvertOptions -> PoseidonIO ()
runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath removeOld outPlinkPopMode onlyLatest) = do
runGenoconvert (GenoconvertOptions genoSources outFormat outPath
removeOld outPlinkPopMode onlyLatest outZip) = do
let pacReadOpts = defaultPackageReadOptions {
_readOptIgnoreChecksums = True
, _readOptIgnoreGeno = False
, _readOptGenoCheck = True
, _readOptOnlyLatest = onlyLatest
}
-- load packages
properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDir x | x@PacBaseDir {} <- genoSources]
pseudoPackages <- mapM makePseudoPackageFromGenotypeData [getGenoDirect x | x@GenoDirect {} <- genoSources]
-- PacBaseDir sources are read as a package collection, GenoDirect sources
-- are wrapped into pseudo packages
properPackages <- readPoseidonPackageCollection pacReadOpts $
[getPacBaseDir x | x@PacBaseDir {} <- genoSources]
pseudoPackages <- mapM makePseudoPackageFromGenotypeData
[getGenoDirect x | x@GenoDirect {} <- genoSources]

logInfo $ "Unpackaged genotype data files loaded: " ++ show (length pseudoPackages)
-- convert
mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode) properPackages
mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode) pseudoPackages
-- the second argument is convertGenoTo's onlyGeno flag: False for proper
-- packages, True for pseudo packages
mapM_ (convertGenoTo outFormat False outPath removeOld outPlinkPopMode outZip) properPackages
mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode outZip) pseudoPackages

convertGenoTo :: String -> Bool -> Maybe FilePath -> Bool ->
PlinkPopNameMode -> PoseidonPackage -> PoseidonIO ()
convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode pac = do
PlinkPopNameMode -> Bool -> PoseidonPackage -> PoseidonIO ()
convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip pac = do
-- start message
logInfo $
"Converting genotype data in "
++ show (posPacNameAndVersion pac)
++ " to format "
++ show outFormat
++ ":"
-- compile file names paths
++ if outZip then " (gzipped):" else ":"

-- compile new relative file names
let outName = getPacName . posPacNameAndVersion $ pac
(outInd, outSnp, outGeno) <- case outFormat of
"EIGENSTRAT" -> return (outName <.> ".ind", outName <.> ".snp", outName <.> ".geno")
"PLINK" -> return (outName <.> ".fam", outName <.> ".bim", outName <.> ".bed")
_ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
-- check if genotype data needs conversion
if getFormat (genotypeFileSpec (posPacGenotypeData pac)) == outFormat
then logWarning "The genotype data is already in the requested format"
let gz = if outZip then "gz" else ""
(outIrel, outSrel, outGrel) <- case outFormat of
"EIGENSTRAT" -> return
(outName <.> ".ind", outName <.> ".snp" <.> gz, outName <.> ".geno" <.> gz)
"PLINK" -> return
(outName <.> ".fam", outName <.> ".bim" <.> gz, outName <.> ".bed" <.> gz)
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"

-- compile new absolute genotype file names
newBaseDir <- case outPath of
Just x -> do
-- create new directory
logInfo $ "Writing to directory (will be created if missing): " ++ x
liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x)
return x
Nothing -> return $ posPacBaseDir pac
let (outGabs, outSabs, outIabs) = (newBaseDir </> outGrel, newBaseDir </> outSrel, newBaseDir </> outIrel)

-- check whether anything needs doing at all
allExists <- and <$> mapM checkFile [outGabs, outSabs, outIabs]
if allExists
then do
if onlyGeno
then do
logError $ "No files were created or overwritten for " ++ show (posPacNameAndVersion pac)
liftIO $ exitWith (ExitFailure 1)
else
logWarning $ "Package already in desired file-type, skipping genotype conversion for " ++
show (posPacNameAndVersion pac)
else do
-- create new genotype data files
newBaseDir <- case outPath of
Just x -> do
-- create new directory
logInfo $ "Writing to directory (will be created if missing): " ++ x
liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x)
return x
Nothing -> return $ posPacBaseDir pac
let (outG, outS, outI) = (newBaseDir </> outGeno, newBaseDir </> outSnp, newBaseDir </> outInd)
anyExists <- or <$> mapM checkFile [outG, outS, outI]
if anyExists
then logWarning ("skipping genotype conversion for " ++ show (posPacNameAndVersion pac))
else do
logInfo "Processing SNPs..."
logA <- envLogAction
currentTime <- liftIO getCurrentTime
errLength <- envErrorLength
let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac
liftIO $ catch (
runSafeT $ do
eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac)
let outConsumer = case outFormat of
"EIGENSTRAT" -> writeEigenstrat outG outS outI eigenstratIndEntries
"PLINK" -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries)
_ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer
) (throwIO . PoseidonGenotypeExceptionForward errLength)
logInfo "Done"
-- overwrite genotype data field in POSEIDON.yml file
unless (onlyGeno || isJust outPath) $ do
gFileSpec <- case outFormat of
"EIGENSTRAT" -> return $ GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing
"PLINK" -> return $ GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing
_ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
let genotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac)
newPac = pac { posPacGenotypeData = genotypeData }
logInfo "Adjusting POSEIDON.yml..."
liftIO $ writePoseidonPackage newPac
-- delete now replaced input genotype data
let filesToDelete = case genotypeFileSpec . posPacGenotypeData $ pac of
GenotypeEigenstrat g _ s _ i _ -> [g, s, i]
GenotypePlink g _ s _ i _ -> [g, s, i]
GenotypeVCF g _ -> [g]
when removeOld . liftIO . mapM_ (removeFile . (posPacBaseDir pac </>)) $ filesToDelete
-- Convert!
logInfo "Processing SNPs..."
logA <- envLogAction
currentTime <- liftIO getCurrentTime
errLength <- envErrorLength
let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac
let zipEnding = if outZip then ".gz" else "" -- we need this to trigger the zipping
liftIO $ catch (
runSafeT $ do
eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac)
let outConsumer = case outFormat of
"EIGENSTRAT" -> writeEigenstrat (outGabs ++ ".gconvert" ++ zipEnding)
(outSabs ++ ".gconvert" ++ zipEnding)
(outIabs ++ ".gconvert")
eigenstratIndEntries
"PLINK" -> writePlink (outGabs ++ ".gconvert" ++ zipEnding)
(outSabs ++ ".gconvert" ++ zipEnding)
(outIabs ++ ".gconvert")
(map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries)
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"
runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer
) (throwIO . PoseidonGenotypeExceptionForward errLength)
-- the following will just overwrite if the file already exists, which is OK
liftIO $ renameFile (outGabs ++ ".gconvert" ++ zipEnding) outGabs
liftIO $ renameFile (outSabs ++ ".gconvert" ++ zipEnding) outSabs
liftIO $ renameFile (outIabs ++ ".gconvert") outIabs
logInfo "Done"

-- overwrite genotype data field in POSEIDON.yml file (using relative paths)
unless (onlyGeno || isJust outPath) $ do
gFileSpec <- case outFormat of
"EIGENSTRAT" -> return $
GenotypeEigenstrat outGrel Nothing outSrel Nothing outIrel Nothing
"PLINK" -> return $
GenotypePlink outGrel Nothing outSrel Nothing outIrel Nothing
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"
let newGenotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac)
newPac = pac { posPacGenotypeData = newGenotypeData }
logInfo $ "Adjusting POSEIDON.yml for " ++ show (posPacNameAndVersion pac)
liftIO $ writePoseidonPackage newPac
-- delete now replaced input genotype data
when removeOld $ do
let oldBaseDir = posPacBaseDir pac
oldGenoFiles <- case genotypeFileSpec . posPacGenotypeData $ pac of
GenotypeEigenstrat g _ s _ i _ -> return [oldBaseDir </> g, oldBaseDir </> s, oldBaseDir </> i]
GenotypePlink g _ s _ i _ -> return [oldBaseDir </> g, oldBaseDir </> s, oldBaseDir </> i]
GenotypeVCF g _ -> return [oldBaseDir </> g]
let newGenoFiles = [outGabs, outSabs, outIabs]
let filesToDelete = oldGenoFiles \\ newGenoFiles
liftIO . mapM_ removeFile $ filesToDelete
where
checkFile :: FilePath -> PoseidonIO Bool
checkFile fn = do
Expand Down
Loading

0 comments on commit d918c11

Please sign in to comment.