diff --git a/.gitignore b/.gitignore index 73e9d059..c2c6f248 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ docs/_build/ .stack-work/ -dist-newstyle/ \ No newline at end of file +dist-newstyle/ +.DS_Store \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index dcd64e48..856500f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +- V 1.5.7.0: + - Added support for VCF files (Variant Call Format) in Janno-packages. + - restructured test package structure, affecting some of the unit- and golden tests. - V 1.5.6.0: - Introduced individual `Janno...` types for every .janno column (except Poseidon_ID) in a new module `ColumnTypes`. This was done to improve .janno validation error messages. - Defined a typeclass `Makeable` with a function `make` to write smart constructors for the column types. diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index 9c568a76..daca83e9 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -1,6 +1,6 @@ -### V 1.5.6.0 +### V 1.5.7.0 -This release further improves `.janno` parsing error messages and adds reading support for gzipped PLINK (`.bed` and `.bim`) and EIGENSTRAT (`.geno` and `.snp`) files. +This release further improves `.janno` parsing error messages and adds reading support for gzipped PLINK (`.bed` and `.bim`) and EIGENSTRAT (`.geno` and `.snp`) files. We also added (experimental) support for reading VCF files. #### Better .janno error messages @@ -26,8 +26,32 @@ The error messages now include the relevant column name and are more concrete an #### Reading support for gzipped genotype data -... +Although not yet part of the Poseidon 2.7.1 standard, Poseidon packages can now contain gzipped genotype files. Specifically, for EIGENSTRAT-formatted genotype data, the genotype matrix file (`.geno`) and the snp-list file (`.snp`) can now also be zipped. This strictly requires file endings with `.gz`, so `.geno.gz` and `.snp.gz`, respectively. 
Similarly, for PLINK-formatted genotype data, we now also accept `.bed.gz` and `.bim.gz`. Any such files with the `gz` file ending are assumed to be gzipped, and are decoded on the fly using stream-processing. Gzipped and unzipped files can also be mixed within the same package. +For commands that support the `--genoOne` option (`init`, `forge` and `genoconvert`), note that we make some assumptions, which are summarised in the help text for the option: + +``` + -p,--genoOne FILE One of the input genotype data files. Expects .bed, + .bed.gz, .bim, .bim.gz or .fam for PLINK, or .geno, + .geno.gz, .snp, .snp.gz or .ind for EIGENSTRAT. The + other files must be in the same directory and must + have the same base name. If a gzipped file is given, + it is assumed that the file pairs (.geno.gz, .snp.gz) + or (.bim.gz, .bed.gz) are both zipped, but not the + .fam or .ind file. If a .ind or .fam file is given, + it is assumed that none of the file triples is + zipped. For VCF please see option --vcfFile +``` + +At this point, `genoconvert` and `forge` do _not_ support writing of gzipped files. This will be added in the future. + +#### VCF support for genotype data + +Although not yet part of the Poseidon 2.7.1 standard, Poseidon packages can now contain VCF (Variant Call Format) files as genotype data, optionally gzipped. In contrast to EIGENSTRAT and PLINK format, which require triples of files, the VCF format requires just one file with ending `.vcf` or `.vcf.gz`. VCF files contain sample names, but no information about genetic sex or group names. This information is usually provided in `.janno` files, so there is no loss of information in Poseidon packages. For `trident init`, which constructs a minimal `.janno` file from the genotype file, we set the `Genetic_Sex` column to "U", and the `Group_Name` column to "unknown". + +The VCF file format is very flexible and can encode a large amount of information (see https://samtools.github.io/hts-specs/VCFv4.2.pdf). 
We do not consider our parsing of VCF files to be complete. The feature is for now experimental, since future users may encounter valid VCF files that cause parsing errors in edge cases. Do not hesitate to file an issue in such a case: https://github.com/poseidon-framework/poseidon-hs/issues. + +At this point, `genoconvert` and `forge` do _not_ support writing of VCF files. This will be added in the future. ### V 1.5.4.0 diff --git a/poseidon-hs.cabal b/poseidon-hs.cabal index b36cb36e..d7e0d08b 100644 --- a/poseidon-hs.cabal +++ b/poseidon-hs.cabal @@ -1,5 +1,5 @@ name: poseidon-hs -version: 1.5.6.0 +version: 1.5.7.0 synopsis: A package with tools for working with Poseidon genotype data description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals. license: MIT diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 44c0e883..1ee50760 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -15,13 +15,14 @@ import Poseidon.EntityTypes (EntityInput, resolveUniqueEntityIndices) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), - GenotypeFormatSpec (..), + GenotypeFileSpec (..), SNPSetSpec (..), printSNPCopyProgress, selectIndices, snpSetMergeList) import Poseidon.Janno (JannoRow (..), JannoRows (..), ListColumn (..), getMaybeListColumn, + jannoRows2EigenstratIndEntries, writeJannoFile) import Poseidon.Package (PackageReadOptions (..), PoseidonPackage (..), @@ -41,9 +42,8 @@ import Poseidon.SequencingSource (SeqSourceRow (..), import Poseidon.Utils (PoseidonException (..), PoseidonIO, checkFile, determinePackageOutName, - envErrorLength, envInputPlinkMode, - envLogAction, logInfo, logWarning, - uniqueRO) + envErrorLength, envLogAction, + logInfo, logWarning, uniqueRO) import Control.Exception (catch, throwIO) import Control.Monad (filterM, forM, forM_, unless, @@ -76,7 +76,7 @@ data ForgeOptions = 
ForgeOptions , _forgeEntityInput :: [EntityInput SignedEntity] -- Empty list = forge all packages , _forgeSnpFile :: Maybe FilePath , _forgeIntersect :: Bool - , _forgeOutFormat :: GenotypeFormatSpec + , _forgeOutFormat :: String , _forgeOutMode :: ForgeOutMode , _forgeOutPacPath :: FilePath , _forgeOutPacName :: Maybe String @@ -114,7 +114,7 @@ runForge ( ) = do -- load packages -- - properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDirs x | x@PacBaseDir {} <- genoSources] + properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDir x | x@PacBaseDir {} <- genoSources] pseudoPackages <- mapM makePseudoPackageFromGenotypeData [getGenoDirect x | x@GenoDirect {} <- genoSources] logInfo $ "Unpackaged genotype data files loaded: " ++ show (length pseudoPackages) let allPackages = properPackages ++ pseudoPackages @@ -177,29 +177,37 @@ runForge ( -- create new directory logInfo $ "Writing to directory (will be created if missing): " ++ outPath liftIO $ createDirectoryIfMissing True outPath - -- compile genotype data structure - let (outInd, outSnp, outGeno) = case outFormat of - GenotypeFormatEigenstrat -> (outName <.> ".ind", outName <.> ".snp", outName <.> ".geno") - GenotypeFormatPlink -> (outName <.> ".fam", outName <.> ".bim", outName <.> ".bed") -- output warning if any snpSet is set to Other snpSetList <- fillMissingSnpSets relevantPackages let newSNPSet = case maybeSnpFile of Nothing -> snpSetMergeList snpSetList intersect_ Just _ -> SNPSetOther - let genotypeData = GenotypeDataSpec outFormat outGeno Nothing outSnp Nothing outInd Nothing (Just newSNPSet) + -- compile genotype data structure + genotypeFileData <- case outFormat of + "EIGENSTRAT" -> return $ + GenotypeEigenstrat (outName <.> ".geno") Nothing + (outName <.> ".snp") Nothing + (outName <.> ".ind") Nothing + "PLINK" -> return $ + GenotypePlink (outName <.> ".bed") Nothing + (outName <.> ".bim") Nothing + (outName <.> ".fam") Nothing + _ -> liftIO . 
throwIO $ + PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") + let genotypeData = GenotypeDataSpec genotypeFileData (Just newSNPSet) -- assemble and write result depending on outMode -- logInfo "Creating new package entity" let pacSource = head relevantPackages case outMode of GenoOut -> do - _ <- compileGenotypeData outPath (outInd, outSnp, outGeno) relevantPackages relevantIndices + _ <- compileGenotypeData outPath genotypeFileData relevantPackages relevantIndices return () MinimalOut -> do - let pac = newMinimalPackageTemplate outPath outName genotypeData + pac <- newMinimalPackageTemplate outPath outName genotypeData writePoseidonYmlFile pac - _ <- compileGenotypeData outPath (outInd, outSnp, outGeno) relevantPackages relevantIndices + _ <- compileGenotypeData outPath genotypeFileData relevantPackages relevantIndices return () PreservePymlOut -> do normalPac <- newPackageTemplate outPath outName genotypeData @@ -217,7 +225,7 @@ runForge ( writeBibFile outPath outName relevantBibEntries copyREADMEFile outPath pacSource copyCHANGELOGFile outPath pacSource - newNrSnps <- compileGenotypeData outPath (outInd, outSnp, outGeno) relevantPackages relevantIndices + newNrSnps <- compileGenotypeData outPath genotypeFileData relevantPackages relevantIndices writingJannoFile outPath outName newNrSnps relevantJannoRows NormalOut -> do pac <- newPackageTemplate outPath outName genotypeData @@ -225,7 +233,7 @@ runForge ( writePoseidonYmlFile pac writeSSFile outPath outName relevantSeqSourceRows writeBibFile outPath outName relevantBibEntries - newNrSnps <- compileGenotypeData outPath (outInd, outSnp, outGeno) relevantPackages relevantIndices + newNrSnps <- compileGenotypeData outPath genotypeFileData relevantPackages relevantIndices writingJannoFile outPath outName newNrSnps relevantJannoRows where @@ -262,22 +270,25 @@ runForge ( let fullSourcePath = posPacBaseDir pacSource path liftIO $ checkFile 
fullSourcePath Nothing liftIO $ copyFile fullSourcePath $ outPath path - compileGenotypeData :: FilePath -> (String,String,String) -> [PoseidonPackage] -> [Int] -> PoseidonIO (VUM.IOVector Int) - compileGenotypeData outPath (outInd, outSnp, outGeno) relevantPackages relevantIndices = do + compileGenotypeData :: FilePath -> GenotypeFileSpec -> [PoseidonPackage] -> [Int] -> PoseidonIO (VUM.IOVector Int) + compileGenotypeData outPath gFileSpec relevantPackages relevantIndices = do logInfo "Compiling genotype data" logInfo "Processing SNPs..." logA <- envLogAction - inPlinkPopMode <- envInputPlinkMode currentTime <- liftIO getCurrentTime errLength <- envErrorLength newNrSNPs <- liftIO $ catch ( runSafeT $ do - (eigenstratIndEntries, eigenstratProd) <- getJointGenotypeData logA intersect_ inPlinkPopMode relevantPackages maybeSnpFile + eigenstratProd <- getJointGenotypeData logA intersect_ relevantPackages maybeSnpFile + let eigenstratIndEntries = jannoRows2EigenstratIndEntries . getJointJanno $ relevantPackages let newEigenstratIndEntries = map (eigenstratIndEntries !!) relevantIndices - let (outG, outS, outI) = (outPath outGeno, outPath outSnp, outPath outInd) - let outConsumer = case outFormat of - GenotypeFormatEigenstrat -> writeEigenstrat outG outS outI newEigenstratIndEntries - GenotypeFormatPlink -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) newEigenstratIndEntries) + let outConsumer = case gFileSpec of + GenotypeEigenstrat outG _ outS _ outI _ -> + writeEigenstrat (outPath outG) (outPath outS) (outPath outI) newEigenstratIndEntries + GenotypePlink outG _ outS _ outI _ -> + writePlink (outPath outG) (outPath outS) (outPath outI) (map (eigenstratInd2PlinkFam outPlinkPopMode) newEigenstratIndEntries) + _ -> liftIO . 
throwIO $ + PoseidonGenericException "only Outformats EIGENSTRAT or PLINK are allowed at the moment" let extractPipe = if packageWise then cat else P.map (selectIndices relevantIndices) -- define main forge pipe including file output. -- The final tee forwards the results to be used in the snpCounting-fold @@ -290,7 +301,7 @@ runForge ( ) (throwIO . PoseidonGenotypeExceptionForward errLength) logInfo "Done" return newNrSNPs - writingJannoFile :: FilePath -> String -> (VUM.MVector VUM.RealWorld Int) -> [JannoRow] -> PoseidonIO () + writingJannoFile :: FilePath -> String -> VUM.MVector VUM.RealWorld Int -> [JannoRow] -> PoseidonIO () writingJannoFile outPath outName newNrSNPs rows = do logInfo "Creating .janno file" snpList <- liftIO $ VU.freeze newNrSNPs @@ -328,7 +339,7 @@ filterBibEntries (JannoRows rows) references_ = fillMissingSnpSets :: [PoseidonPackage] -> PoseidonIO [SNPSetSpec] fillMissingSnpSets packages = forM packages $ \pac -> do let pac_ = posPacNameAndVersion pac - maybeSnpSet = snpSet . posPacGenotypeData $ pac + maybeSnpSet = genotypeSnpSet . 
posPacGenotypeData $ pac case maybeSnpSet of Just s -> return s Nothing -> do diff --git a/src/Poseidon/CLI/Genoconvert.hs b/src/Poseidon/CLI/Genoconvert.hs index 8bdf30cb..c98f0914 100644 --- a/src/Poseidon/CLI/Genoconvert.hs +++ b/src/Poseidon/CLI/Genoconvert.hs @@ -5,18 +5,18 @@ module Poseidon.CLI.Genoconvert where import Poseidon.EntityTypes (HasNameAndVersion (..)) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), - GenotypeFormatSpec (..), + GenotypeFileSpec (..), getFormat, loadGenotypeData, printSNPCopyProgress) +import Poseidon.Janno (jannoRows2EigenstratIndEntries) import Poseidon.Package (PackageReadOptions (..), - PoseidonException (PoseidonGenotypeExceptionForward), PoseidonPackage (..), defaultPackageReadOptions, makePseudoPackageFromGenotypeData, readPoseidonPackageCollection, writePoseidonPackage) -import Poseidon.Utils (PoseidonIO, envErrorLength, - envInputPlinkMode, envLogAction, +import Poseidon.Utils (PoseidonException (..), PoseidonIO, + envErrorLength, envLogAction, logInfo, logWarning) import Control.Exception (catch, throwIO) @@ -36,7 +36,7 @@ import System.FilePath (dropTrailingPathSeparator, (<.>), -- | A datatype representing command line options for the validate command data GenoconvertOptions = GenoconvertOptions { _genoconvertGenoSources :: [GenoDataSource] - , _genoConvertOutFormat :: GenotypeFormatSpec + , _genoConvertOutFormat :: String , _genoConvertOutOnlyGeno :: Bool , _genoMaybeOutPackagePath :: Maybe FilePath , _genoconvertRemoveOld :: Bool @@ -53,18 +53,17 @@ runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath remove , _readOptOnlyLatest = onlyLatest } -- load packages - properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDirs x | x@PacBaseDir {} <- genoSources] - inPlinkPopMode <- envInputPlinkMode + properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDir x | x@PacBaseDir {} <- genoSources] pseudoPackages <- mapM 
makePseudoPackageFromGenotypeData [getGenoDirect x | x@GenoDirect {} <- genoSources] logInfo $ "Unpackaged genotype data files loaded: " ++ show (length pseudoPackages) -- convert - mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld inPlinkPopMode outPlinkPopMode) properPackages - mapM_ (convertGenoTo outFormat True outPath removeOld inPlinkPopMode outPlinkPopMode) pseudoPackages + mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode) properPackages + mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode) pseudoPackages -convertGenoTo :: GenotypeFormatSpec -> Bool -> Maybe FilePath -> Bool -> PlinkPopNameMode -> +convertGenoTo :: String -> Bool -> Maybe FilePath -> Bool -> PlinkPopNameMode -> PoseidonPackage -> PoseidonIO () -convertGenoTo outFormat onlyGeno outPath removeOld inPlinkPopMode outPlinkPopMode pac = do +convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode pac = do -- start message logInfo $ "Converting genotype data in " @@ -74,11 +73,12 @@ convertGenoTo outFormat onlyGeno outPath removeOld inPlinkPopMode outPlinkPopMod ++ ":" -- compile file names paths let outName = getPacName . posPacNameAndVersion $ pac - let (outInd, outSnp, outGeno) = case outFormat of - GenotypeFormatEigenstrat -> (outName <.> ".ind", outName <.> ".snp", outName <.> ".geno") - GenotypeFormatPlink -> (outName <.> ".fam", outName <.> ".bim", outName <.> ".bed") + (outInd, outSnp, outGeno) <- case outFormat of + "EIGENSTRAT" -> return (outName <.> ".ind", outName <.> ".snp", outName <.> ".geno") + "PLINK" -> return (outName <.> ".fam", outName <.> ".bim", outName <.> ".bed") + _ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". 
Only Outformats EIGENSTRAT or PLINK are allowed at the moment") -- check if genotype data needs conversion - if format (posPacGenotypeData pac) == outFormat + if getFormat (genotypeFileSpec (posPacGenotypeData pac)) == outFormat then logWarning "The genotype data is already in the requested format" else do -- create new genotype data files @@ -98,27 +98,33 @@ convertGenoTo outFormat onlyGeno outPath removeOld inPlinkPopMode outPlinkPopMod logA <- envLogAction currentTime <- liftIO getCurrentTime errLength <- envErrorLength + let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac liftIO $ catch ( runSafeT $ do - (eigenstratIndEntries, eigenstratProd) <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) inPlinkPopMode + eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) let outConsumer = case outFormat of - GenotypeFormatEigenstrat -> writeEigenstrat outG outS outI eigenstratIndEntries - GenotypeFormatPlink -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries) + "EIGENSTRAT" -> writeEigenstrat outG outS outI eigenstratIndEntries + "PLINK" -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries) + _ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer ) (throwIO . PoseidonGenotypeExceptionForward errLength) logInfo "Done" -- overwrite genotype data field in POSEIDON.yml file unless (onlyGeno || isJust outPath) $ do - let genotypeData = GenotypeDataSpec outFormat outGeno Nothing outSnp Nothing outInd Nothing (snpSet . 
posPacGenotypeData $ pac) + gFileSpec <- case outFormat of + "EIGENSTRAT" -> return $ GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing + "PLINK" -> return $ GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing + _ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") + let genotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac) newPac = pac { posPacGenotypeData = genotypeData } logInfo "Adjusting POSEIDON.yml..." liftIO $ writePoseidonPackage newPac -- delete now replaced input genotype data - when removeOld $ liftIO $ mapM_ removeFile [ - posPacBaseDir pac genoFile (posPacGenotypeData pac) - , posPacBaseDir pac snpFile (posPacGenotypeData pac) - , posPacBaseDir pac indFile (posPacGenotypeData pac) - ] + let filesToDelete = case genotypeFileSpec . posPacGenotypeData $ pac of + GenotypeEigenstrat g _ s _ i _ -> [g, s, i] + GenotypePlink g _ s _ i _ -> [g, s, i] + GenotypeVCF g _ -> [g] + when removeOld . liftIO . mapM_ (removeFile . 
(posPacBaseDir pac )) $ filesToDelete where checkFile :: FilePath -> PoseidonIO Bool checkFile fn = do diff --git a/src/Poseidon/CLI/Init.hs b/src/Poseidon/CLI/Init.hs index 13829271..87185ef7 100644 --- a/src/Poseidon/CLI/Init.hs +++ b/src/Poseidon/CLI/Init.hs @@ -3,7 +3,9 @@ module Poseidon.CLI.Init where import Poseidon.BibFile (dummyBibEntry, writeBibTeXFile) -import Poseidon.GenotypeData (GenotypeDataSpec (..), loadIndividuals) +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..), loadIndividuals, + reduceGenotypeFilepaths) import Poseidon.Janno (writeJannoFile) import Poseidon.Package (PoseidonPackage (..), newMinimalPackageTemplate, @@ -12,7 +14,7 @@ import Poseidon.Package (PoseidonPackage (..), import Poseidon.Utils (PoseidonIO, checkFile, determinePackageOutName, logInfo) -import Control.Monad (unless) +import Control.Monad (forM_, unless) import Control.Monad.IO.Class (liftIO) import System.Directory (copyFile, createDirectoryIfMissing) import System.FilePath (dropTrailingPathSeparator, @@ -26,32 +28,30 @@ data InitOptions = InitOptions } runInit :: InitOptions -> PoseidonIO () -runInit (InitOptions gd outPathRaw maybeOutName minimal) = do - let (GenotypeDataSpec format_ genoFile_ _ snpFile_ _ indFile_ _ snpSet_) = gd +runInit (InitOptions genotypeDataIn outPathRaw maybeOutName minimal) = do -- create new directory let outPath = dropTrailingPathSeparator outPathRaw logInfo $ "Creating new package directory: " ++ outPath liftIO $ createDirectoryIfMissing True outPath -- compile genotype data structure - let outInd = takeFileName indFile_ - outSnp = takeFileName snpFile_ - outGeno = takeFileName genoFile_ - genotypeData = GenotypeDataSpec format_ outGeno Nothing outSnp Nothing outInd Nothing snpSet_ + genotypeDataOut <- snd <$> reduceGenotypeFilepaths genotypeDataIn -- genotype data logInfo "Copying genotype data" - liftIO $ checkFile indFile_ Nothing - liftIO $ checkFile snpFile_ Nothing - liftIO $ checkFile genoFile_ Nothing - 
liftIO $ copyFile indFile_ $ outPath outInd - liftIO $ copyFile snpFile_ $ outPath outSnp - liftIO $ copyFile genoFile_ $ outPath outGeno + let sourceFiles = case genotypeFileSpec genotypeDataIn of + GenotypeEigenstrat genoFile _ snpFile _ indFile _ -> [genoFile, snpFile, indFile] + GenotypePlink genoFile _ snpFile _ indFile _ -> [genoFile, snpFile, indFile] + GenotypeVCF vcfFile _ -> [vcfFile] + forM_ sourceFiles $ \sourceFile -> do + liftIO $ checkFile sourceFile Nothing + let targetFile = outPath takeFileName sourceFile + liftIO $ copyFile sourceFile targetFile -- create new package logInfo "Creating new package entity" outName <- liftIO $ determinePackageOutName maybeOutName outPath - inds <- loadIndividuals outPath genotypeData + inds <- loadIndividuals outPath genotypeDataOut pac <- if minimal - then return $ newMinimalPackageTemplate outPath outName genotypeData - else newPackageTemplate outPath outName genotypeData (Just (Left inds)) mempty [dummyBibEntry] + then newMinimalPackageTemplate outPath outName genotypeDataOut + else newPackageTemplate outPath outName genotypeDataOut (Just (Left inds)) mempty [dummyBibEntry] -- POSEIDON.yml logInfo "Creating POSEIDON.yml" liftIO $ writePoseidonPackage pac diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index f0f458b4..807f86e2 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -19,7 +19,7 @@ import Poseidon.EntityTypes (EntitiesList, EntityInput (..), readEntitiesFromString) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), - GenotypeFormatSpec (..), + GenotypeFileSpec (..), SNPSetSpec (..)) import Poseidon.ServerClient (AddJannoColSpec (..), ArchiveEndpoint (..)) @@ -32,6 +32,7 @@ import Poseidon.Version (VersionComponent (..), import Control.Applicative ((<|>)) import qualified Data.ByteString.Char8 as BSC +import Data.List (intercalate) import Data.List.Split 
(splitOn) import Data.Version (Version) import qualified Options.Applicative as OP @@ -355,27 +356,22 @@ parseRemoteDummy = OP.flag' () ( OP.long "remote" <> OP.help "List packages from a remote server instead the local file system.") -parseOutGenotypeFormat :: Bool -> OP.Parser GenotypeFormatSpec +parseOutGenotypeFormat :: Bool -> OP.Parser String parseOutGenotypeFormat withDefault = if withDefault - then OP.option (OP.eitherReader readGenotypeFormat) settingsWithDefault - else OP.option (OP.eitherReader readGenotypeFormat) settingsWithoutDefault + then OP.strOption settingsWithDefault + else OP.strOption settingsWithoutDefault where settingsWithDefault = OP.long "outFormat" <> OP.metavar "FORMAT" <> OP.help "The format of the output genotype data: EIGENSTRAT or PLINK." <> - OP.value GenotypeFormatPlink <> + OP.value "PLINK" <> OP.showDefault settingsWithoutDefault = OP.long "outFormat" <> OP.metavar "FORMAT" <> OP.help "the format of the output genotype data: EIGENSTRAT or PLINK." - readGenotypeFormat :: String -> Either String GenotypeFormatSpec - readGenotypeFormat s = case s of - "EIGENSTRAT" -> Right GenotypeFormatEigenstrat - "PLINK" -> Right GenotypeFormatPlink - _ -> Left "must be EIGENSTRAT or PLINK" parseGenoDataSources :: OP.Parser [GenoDataSource] parseGenoDataSources = OP.some parseGenoDataSource @@ -438,20 +434,12 @@ parseBasePath = OP.strOption ( OP.help "A base directory to search for Poseidon packages.") parseInGenoWithoutSNPSet :: OP.Parser GenotypeDataSpec -parseInGenoWithoutSNPSet = createGeno <$> (parseInGenoOne <|> parseInGenoSep) - where - createGeno :: GenoInput -> GenotypeDataSpec - createGeno (a,b,c,d) = GenotypeDataSpec a b Nothing c Nothing d Nothing Nothing +parseInGenoWithoutSNPSet = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> pure Nothing parseInGenotypeDataset :: OP.Parser GenotypeDataSpec -parseInGenotypeDataset = createGeno <$> (parseInGenoOne <|> parseInGenoSep) <*> parseGenotypeSNPSet - where - createGeno :: 
GenoInput -> Maybe SNPSetSpec -> GenotypeDataSpec - createGeno (a,b,c,d) = GenotypeDataSpec a b Nothing c Nothing d Nothing - -type GenoInput = (GenotypeFormatSpec, FilePath, FilePath, FilePath) +parseInGenotypeDataset = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> (Just <$> parseGenotypeSNPSet) -parseInGenoOne :: OP.Parser GenoInput +parseInGenoOne :: OP.Parser GenotypeFileSpec parseInGenoOne = OP.option (OP.eitherReader readGenoInput) ( OP.long "genoOne" <> OP.short 'p' <> @@ -462,60 +450,68 @@ parseInGenoOne = OP.option (OP.eitherReader readGenoInput) ( \The other files must be in the same directory \ \and must have the same base name. If a gzipped file is given, it is assumed that the \ \file pairs (.geno.gz, .snp.gz) or (.bim.gz, .bed.gz) are both zipped, but not the .fam or .ind file. \ - \If a .ind or .fam file is given, it is assumed that none of the file triples is zipped.") + \If a .ind or .fam file is given, it is assumed that none of the file triples is zipped. 
\ + \For VCF please see option --vcfFile") where - readGenoInput :: FilePath -> Either String GenoInput + readGenoInput :: FilePath -> Either String GenotypeFileSpec readGenoInput p = makeGenoInput (dropExtensions p) (takeExtensions p) makeGenoInput path ext - | ext `elem` [".geno", ".snp", ".ind"] = Right (GenotypeFormatEigenstrat, path <.> ".geno", path <.> ".snp", path <.> ".ind") - | ext `elem` [".geno.gz", ".snp.gz" ] = Right (GenotypeFormatEigenstrat, path <.> ".geno.gz", path <.> ".snp.gz", path <.> ".ind") - | ext `elem` [".bed", ".bim", ".fam"] = Right (GenotypeFormatPlink, path <.> ".bed", path <.> ".bim", path <.> ".fam") - | ext `elem` [".bed.gz", ".bim.gz" ] = Right (GenotypeFormatPlink, path <.> ".bed.gz", path <.> ".bim.gz", path <.> ".fam") - | otherwise = Left $ "unknown file extension: " ++ ext - -parseInGenoSep :: OP.Parser GenoInput -parseInGenoSep = (,,,) <$> parseInGenotypeFormat <*> parseInGenoFile <*> parseInSnpFile <*> parseInIndFile - -parseInGenotypeFormat :: OP.Parser GenotypeFormatSpec -parseInGenotypeFormat = OP.option (OP.eitherReader readGenotypeFormat) ( - OP.long "inFormat" <> - OP.metavar "FORMAT" <> - OP.help "The format of the input genotype data: EIGENSTRAT or PLINK. 
\ \Only necessary for data input with --genoFile + --snpFile + --indFile.") + | ext `elem` [".geno", ".snp", ".ind"] = + Right $ GenotypeEigenstrat (path <.> ".geno") Nothing + (path <.> ".snp") Nothing + (path <.> ".ind") Nothing + | ext `elem` [".geno.gz", ".snp.gz" ] = + Right $ GenotypeEigenstrat (path <.> ".geno.gz") Nothing + (path <.> ".snp.gz") Nothing + (path <.> ".ind") Nothing + | ext `elem` [".bed", ".bim", ".fam"] = + Right $ GenotypePlink (path <.> ".bed") Nothing + (path <.> ".bim") Nothing + (path <.> ".fam") Nothing + | ext `elem` [".bed.gz", ".bim.gz" ] = + Right $ GenotypePlink (path <.> ".bed.gz") Nothing + (path <.> ".bim.gz") Nothing + (path <.> ".fam") Nothing + | otherwise = Left $ "unknown file extension: " ++ ext + +parseInGenoSep :: OP.Parser GenotypeFileSpec +parseInGenoSep = parseEigenstrat <|> parsePlink <|> parseVCF where - readGenotypeFormat :: String -> Either String GenotypeFormatSpec - readGenotypeFormat s = case s of - "EIGENSTRAT" -> Right GenotypeFormatEigenstrat - "PLINK" -> Right GenotypeFormatPlink - _ -> Left "must be EIGENSTRAT or PLINK" - -parseInGenoFile :: OP.Parser FilePath -parseInGenoFile = OP.strOption ( - OP.long "genoFile" <> - OP.metavar "FILE" <> - OP.help "Path to the input geno file.") - -parseInSnpFile :: OP.Parser FilePath -parseInSnpFile = OP.strOption ( - OP.long "snpFile" <> - OP.metavar "FILE" <> - OP.help "Path to the input snp file.") - -parseInIndFile :: OP.Parser FilePath -parseInIndFile = OP.strOption ( - OP.long "indFile" <> - OP.metavar "FILE" <> - OP.help "Path to the input ind file.") + parseEigenstrat = GenotypeEigenstrat <$> + parseFileWithEndings "Eigenstrat genotype matrix, optionally gzipped" "genoFile" [".geno", ".geno.gz"] <*> + pure Nothing <*> + parseFileWithEndings "Eigenstrat snp positions file, optionally gzipped" "snpFile" [".snp", ".snp.gz"] <*> + pure Nothing <*> + parseFileWithEndings "Eigenstrat individual file" "indFile" [".ind"] <*> + pure Nothing + parsePlink = 
GenotypePlink <$> + parseFileWithEndings "Plink genotype matrix, optionally gzipped" "bedFile" [".bed", ".bed.gz"] <*> + pure Nothing <*> + parseFileWithEndings "Plink snp positions file, optionally gzipped" "bimFile" [".bim", ".bim.gz"] <*> + pure Nothing <*> + parseFileWithEndings "Plink individual file" "famFile" [".fam"] <*> pure Nothing + parseVCF = GenotypeVCF <$> + parseFileWithEndings "VCF (Variant Call Format) file, optionally gzipped" "vcfFile" [".vcf", ".vcf.gz"] <*> + pure Nothing + +parseFileWithEndings :: String -> String -> [String] -> OP.Parser FilePath +parseFileWithEndings help long endings = OP.option (OP.maybeReader fileEndingReader) ( + OP.long long <> + OP.help (help ++ ". Accepted file endings are " ++ intercalate ", " endings) <> + OP.metavar "FILE") + where + fileEndingReader :: String -> Maybe FilePath + fileEndingReader optString = if takeExtensions optString `elem` endings then Just optString else Nothing -parseGenotypeSNPSet :: OP.Parser (Maybe SNPSetSpec) -parseGenotypeSNPSet = OP.option (Just <$> OP.eitherReader readSnpSet) ( +parseGenotypeSNPSet :: OP.Parser SNPSetSpec +parseGenotypeSNPSet = OP.option (OP.eitherReader readSnpSet) ( OP.long "snpSet" <> OP.metavar "SET" <> OP.help "The snpSet of the package: 1240K, HumanOrigins or Other. \ \Only relevant for data input with -p|--genoOne or --genoFile + --snpFile + --indFile, \ \because the packages in a -d|--baseDir already have this information in their respective \ \POSEIDON.yml files. 
(default: Other)" <> - OP.value (Just SNPSetOther)) + OP.value SNPSetOther) where readSnpSet :: String -> Either String SNPSetSpec readSnpSet s = case s of diff --git a/src/Poseidon/CLI/Rectify.hs b/src/Poseidon/CLI/Rectify.hs index 766b4ca6..f1b9ec40 100644 --- a/src/Poseidon/CLI/Rectify.hs +++ b/src/Poseidon/CLI/Rectify.hs @@ -8,7 +8,8 @@ import Poseidon.Contributor (ContributorSpec (..)) import Poseidon.EntityTypes (HasNameAndVersion (..), PacNameAndVersion (..), renderNameWithVersion) -import Poseidon.GenotypeData (GenotypeDataSpec (..)) +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..)) import Poseidon.Package (PackageReadOptions (..), PoseidonPackage (..), defaultPackageReadOptions, @@ -20,7 +21,7 @@ import Poseidon.Version (VersionComponent (..), updateThreeComponentVersion) import Control.DeepSeq ((<$!!>)) -import Control.Monad.IO.Class (liftIO) +import Control.Monad.IO.Class (MonadIO, liftIO) import Data.List (nub) import Data.Maybe (fromJust) import Data.Time (UTCTime (..), getCurrentTime) @@ -98,29 +99,24 @@ updateChecksums checksumSetting pac = do update :: Bool -> Bool -> Bool -> Bool -> PoseidonIO PoseidonPackage update g j s b = do let d = posPacBaseDir pac - newGenotypeDataSection <- + let gFileSpec = genotypeFileSpec . 
posPacGenotypeData $ pac + newGenotypeFileSpec <- if g then do logDebug "Updating genotype data checksums" - let gd = posPacGenotypeData pac - genoExists <- exists (d genoFile gd) - genoChkSum <- if genoExists - then Just <$!!> getChk (d genoFile gd) - else return $ genoFileChkSum gd - snpExists <- exists (d snpFile gd) - snpChkSum <- if snpExists - then Just <$!!> getChk (d snpFile gd) - else return $ snpFileChkSum gd - indExists <- exists (d indFile gd) - indChkSum <- if indExists - then Just <$!!> getChk (d indFile gd) - else return $ indFileChkSum gd - return $ gd { - genoFileChkSum = genoChkSum, - snpFileChkSum = snpChkSum, - indFileChkSum = indChkSum - } - else return $ posPacGenotypeData pac + case gFileSpec of + GenotypeEigenstrat gf gfc sf sfc if_ ifc -> do + [genoChkSum, snpChkSum, indChkSum] <- + sequence [testAndGetChecksum f c | (f, c) <- zip [gf, sf, if_] [gfc, sfc, ifc]] + return $ GenotypeEigenstrat gf genoChkSum sf snpChkSum if_ indChkSum + GenotypePlink gf gfc sf sfc if_ ifc -> do + [genoChkSum, snpChkSum, indChkSum] <- + sequence [testAndGetChecksum f c | (f, c) <- zip [gf, sf, if_] [gfc, sfc, ifc]] + return $ GenotypePlink gf genoChkSum sf snpChkSum if_ indChkSum + GenotypeVCF gf gfc -> do + genoChkSum <- testAndGetChecksum gf gfc + return $ GenotypeVCF gf genoChkSum + else return gFileSpec newJannoChkSum <- if j then do @@ -145,14 +141,21 @@ updateChecksums checksumSetting pac = do Nothing -> return $ posPacBibFileChkSum pac Just fn -> Just <$!!> getChk (d fn) else return $ posPacBibFileChkSum pac + let gd = posPacGenotypeData pac return $ pac { - posPacGenotypeData = newGenotypeDataSection, + posPacGenotypeData = gd {genotypeFileSpec = newGenotypeFileSpec}, posPacJannoFileChkSum = newJannoChkSum, posPacSeqSourceFileChkSum = newSeqSourceChkSum, posPacBibFileChkSum = newBibChkSum } + getChk :: (MonadIO m) => FilePath -> m String getChk = liftIO . getChecksum - exists = liftIO . 
doesFileExist + testAndGetChecksum :: (MonadIO m) => FilePath -> Maybe String -> m (Maybe String) + testAndGetChecksum file defaultChkSum = do + e <- liftIO . doesFileExist $ file + if e then Just <$!!> getChk file else return defaultChkSum + + completeAndWritePackage :: Maybe PackageVersionUpdate -> PoseidonPackage -> PoseidonIO () completeAndWritePackage Nothing pac = do diff --git a/src/Poseidon/CLI/Serve.hs b/src/Poseidon/CLI/Serve.hs index 9377df8f..468faff3 100644 --- a/src/Poseidon/CLI/Serve.hs +++ b/src/Poseidon/CLI/Serve.hs @@ -6,7 +6,8 @@ module Poseidon.CLI.Serve (runServer, runServerMainThread, ServeOptions(..)) whe import Poseidon.EntityTypes (HasNameAndVersion (..), PacNameAndVersion, renderNameWithVersion) -import Poseidon.GenotypeData (GenotypeDataSpec (..)) +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..)) import Poseidon.Package (PackageReadOptions (..), PoseidonPackage (..), defaultPackageReadOptions, @@ -25,7 +26,7 @@ import Codec.Archive.Zip (Archive, addEntryToArchive, emptyArchive, fromArchive, toEntry) import Control.Concurrent.MVar (MVar, newEmptyMVar, putMVar) -import Control.Monad (forM, when) +import Control.Monad (foldM, forM, when) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Lazy as B import Data.List (nub, sortOn) @@ -229,12 +230,11 @@ checkZipFileOutdated pac fn = do Just fn_ -> checkOutdated zipModTime (posPacBaseDir pac fn_) Nothing -> return False let gd = posPacGenotypeData pac - genoOutdated <- checkOutdated zipModTime (posPacBaseDir pac genoFile gd) - snpOutdated <- checkOutdated zipModTime (posPacBaseDir pac snpFile gd) - indOutdated <- checkOutdated zipModTime (posPacBaseDir pac indFile gd) - return $ or [yamlOutdated, bibOutdated, jannoOutdated, readmeOutdated, - changelogOutdated, genoOutdated, snpOutdated, indOutdated, - ssfOutdated] + genoFilesOutdated <- case genotypeFileSpec gd of + GenotypeEigenstrat gf _ sf _ i _ -> mapM (checkOutdated zipModTime . 
(posPacBaseDir pac )) [gf, sf, i] + GenotypePlink gf _ sf _ i _ -> mapM (checkOutdated zipModTime . (posPacBaseDir pac )) [gf, sf, i] + GenotypeVCF gf _ -> mapM (checkOutdated zipModTime . (posPacBaseDir pac )) [gf] + return . or $ [yamlOutdated, bibOutdated, jannoOutdated, readmeOutdated, changelogOutdated, ssfOutdated] ++ genoFilesOutdated else return True where @@ -242,30 +242,21 @@ checkZipFileOutdated pac fn = do makeZipArchive :: PoseidonPackage -> IO Archive makeZipArchive pac = - addYaml emptyArchive >>= addJanno >>= addBib >>= addReadme >>= addChangelog >>= addInd >>= addSnp >>= addGeno >>= addSSF + addYaml emptyArchive >>= addJanno >>= addBib >>= addReadme >>= addChangelog >>= addGenos >>= addSSF where - addYaml = addFN "POSEIDON.yml" (posPacBaseDir pac) - addJanno = case posPacJannoFile pac of - Nothing -> return - Just fn -> addFN fn (posPacBaseDir pac) - addBib = case posPacBibFile pac of - Nothing -> return - Just fn -> addFN fn (posPacBaseDir pac) - addReadme = case posPacReadmeFile pac of - Nothing -> return - Just fn -> addFN fn (posPacBaseDir pac) - addChangelog = case posPacChangelogFile pac of - Nothing -> return - Just fn -> addFN fn (posPacBaseDir pac) - addSSF = case posPacSeqSourceFile pac of - Nothing -> return - Just fn -> addFN fn (posPacBaseDir pac) - addInd = addFN (indFile . posPacGenotypeData $ pac) (posPacBaseDir pac) - addSnp = addFN (snpFile . posPacGenotypeData $ pac) (posPacBaseDir pac) - addGeno = addFN (genoFile . posPacGenotypeData $ pac) (posPacBaseDir pac) - addFN :: FilePath -> FilePath -> Archive -> IO Archive - addFN fn baseDir a = do - let fullFN = baseDir fn + addYaml = addFN "POSEIDON.yml" + addJanno = maybe return addFN (posPacJannoFile pac) + addBib = maybe return addFN (posPacBibFile pac) + addReadme = maybe return addFN (posPacReadmeFile pac) + addChangelog = maybe return addFN (posPacChangelogFile pac) + addSSF = maybe return addFN (posPacSeqSourceFile pac) + addGenos archive = case genotypeFileSpec . 
posPacGenotypeData $ pac of + GenotypeEigenstrat gf _ sf _ i _ -> foldM (flip addFN) archive [gf, sf, i] + GenotypePlink gf _ sf _ i _ -> foldM (flip addFN) archive [gf, sf, i] + GenotypeVCF gf _ -> addFN gf archive + addFN :: FilePath -> Archive -> IO Archive + addFN fn a = do + let fullFN = posPacBaseDir pac fn raw <- B.readFile fullFN modTime <- round . utcTimeToPOSIXSeconds <$> getModificationTime fullFN let zipEntry = toEntry fn modTime raw diff --git a/src/Poseidon/CLI/Survey.hs b/src/Poseidon/CLI/Survey.hs index a5439932..fed67e58 100644 --- a/src/Poseidon/CLI/Survey.hs +++ b/src/Poseidon/CLI/Survey.hs @@ -8,7 +8,8 @@ module Poseidon.CLI.Survey where import Poseidon.BibFile (BibTeX) -import Poseidon.GenotypeData (GenotypeDataSpec (..)) +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..)) import Poseidon.Janno (CsvNamedRecord, GeneticSex, JannoRows (..), ListColumn (..)) import Poseidon.Package (PackageReadOptions (..), @@ -53,11 +54,11 @@ runSurvey (SurveyOptions baseDirs rawOutput onlyLatest) = do -- collect information let packageNames = map posPacNameAndVersion allPackages -- geno - let genotypeDataTuples = [(posPacBaseDir pac, posPacGenotypeData pac) | pac <- allPackages] - genoFilesExist <- liftIO $ sequence [doesFileExist (d genoFile gd) | (d, gd) <- genotypeDataTuples] - snpFilesExist <- liftIO $ sequence [doesFileExist (d snpFile gd) | (d, gd) <- genotypeDataTuples] - indFilesExist <- liftIO $ sequence [doesFileExist (d indFile gd) | (d, gd) <- genotypeDataTuples] - let genoTypeDataExists = map (\(a,b,c) -> a && b && c) $ zip3 genoFilesExist snpFilesExist indFilesExist + genoTypeDataExists <- forM allPackages $ \pac -> do + case genotypeFileSpec . posPacGenotypeData $ pac of + GenotypeEigenstrat gf _ sf _ i _ -> and <$> mapM (liftIO . doesFileExist . (posPacBaseDir pac )) [gf, sf, i] + GenotypePlink gf _ sf _ i _ -> and <$> mapM (liftIO . doesFileExist . (posPacBaseDir pac )) [gf, sf, i] + GenotypeVCF gf _ -> liftIO . 
doesFileExist $ posPacBaseDir pac gf -- janno let jannos = map posPacJanno allPackages -- ssf diff --git a/src/Poseidon/CLI/Validate.hs b/src/Poseidon/CLI/Validate.hs index 1504abba..318938ab 100644 --- a/src/Poseidon/CLI/Validate.hs +++ b/src/Poseidon/CLI/Validate.hs @@ -4,7 +4,8 @@ module Poseidon.CLI.Validate where import Poseidon.BibFile (readBibTeXFile) -import Poseidon.GenotypeData (GenotypeDataSpec (..)) +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..)) import Poseidon.Janno (JannoRows (..), readJannoFile) import Poseidon.Package (PackageReadOptions (..), PoseidonException (..), @@ -90,7 +91,11 @@ runValidate (ValidateOptions (ValPlanPoseidonYaml path) noExitCode _) = do logInfo $ "Read .yml file of package " ++ _posYamlTitle yml conclude True noExitCode runValidate (ValidateOptions (ValPlanGeno geno) noExitCode _) = do - logInfo $ "Validating: " ++ genoFile geno + let gFile = case genotypeFileSpec geno of + GenotypeEigenstrat gf _ _ _ _ _ -> gf + GenotypePlink gf _ _ _ _ _ -> gf + GenotypeVCF gf _ -> gf + logInfo $ "Validating: " ++ gFile pac <- makePseudoPackageFromGenotypeData geno validateGeno pac True conclude True noExitCode diff --git a/src/Poseidon/GenotypeData.hs b/src/Poseidon/GenotypeData.hs index 60efbd34..ab0fa5d4 100644 --- a/src/Poseidon/GenotypeData.hs +++ b/src/Poseidon/GenotypeData.hs @@ -7,7 +7,8 @@ import Poseidon.Utils (LogA, PoseidonException (..), logInfo, logWithEnv, padLeft) import Control.Exception (throwIO) -import Control.Monad (forM) +import Control.Monad (forM, unless) +import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.IO.Class (MonadIO, liftIO) import Data.Aeson (FromJSON, ToJSON, object, parseJSON, toJSON, withObject, @@ -20,91 +21,114 @@ import qualified Data.Text as T import Data.Time (NominalDiffTime, UTCTime, diffUTCTime, getCurrentTime) import qualified Data.Vector as V -import Pipes (Pipe, Producer, cat, for, yield) -import Pipes.Safe (MonadSafe) +import Pipes (Pipe, 
Producer, cat, for, yield, + (>->)) +import Pipes.Safe (MonadSafe, runSafeT) import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), EigenstratSnpEntry (..), - GenoEntry (..), GenoLine, + GenoEntry (..), GenoLine, Sex (..), readEigenstrat, readEigenstratInd) -import SequenceFormats.Plink (PlinkPopNameMode, - plinkFam2EigenstratInd, +import SequenceFormats.FreqSum (FreqSumEntry (..)) +import SequenceFormats.Plink (plinkFam2EigenstratInd, readFamFile, readPlink) -import System.FilePath (()) +import SequenceFormats.VCF (VCFentry (..), VCFheader (..), + readVCFfromFile, vcfToFreqSumEntry) +import System.FilePath (takeDirectory, takeFileName, ()) data GenoDataSource = PacBaseDir - { getPacBaseDirs :: FilePath + { getPacBaseDir :: FilePath } | GenoDirect { getGenoDirect :: GenotypeDataSpec } deriving Show --- | A datatype to specify genotype files -data GenotypeDataSpec = GenotypeDataSpec - { format :: GenotypeFormatSpec - -- ^ the genotype format - , genoFile :: FilePath - -- ^ path to the geno file - , genoFileChkSum :: Maybe String - -- ^ the optional checksum for the geno file - , snpFile :: FilePath - -- ^ path to the snp file - , snpFileChkSum :: Maybe String - -- ^ the optional checksum for the Snp file - , indFile :: FilePath - -- ^ path to the ind file - , indFileChkSum :: Maybe String - -- ^ the optional checksum for the indfile - , snpSet :: Maybe SNPSetSpec - -- ^ the SNP set de facto listed in the genotype data - } - deriving (Show, Eq) +data GenotypeDataSpec = GenotypeDataSpec { + genotypeFileSpec :: GenotypeFileSpec, + genotypeSnpSet :: Maybe SNPSetSpec +} deriving (Show, Eq) + +data GenotypeFileSpec = GenotypeEigenstrat { + _esGenoFile :: FilePath, + _esGenoFileChkSum :: Maybe String, + _esSnpFile :: FilePath, + _esSnpFileChkSum :: Maybe String, + _esIndFile :: FilePath, + _esIndFileChkSum :: Maybe String +} | GenotypePlink { + _plGenoFile :: FilePath, + _plGenoFileChkSum :: Maybe String, + _plSnpFile :: FilePath, + _plSnpFileChkSum :: Maybe String, + 
_plIndFile :: FilePath, + _plIndFileChkSum :: Maybe String +} | GenotypeVCF { + _vcfGenoFile :: FilePath, + _vcfGenoFileChkSum :: Maybe String +} deriving (Show, Eq) + +getFormat :: GenotypeFileSpec -> String +getFormat (GenotypeEigenstrat _ _ _ _ _ _) = "EIGENSTRAT" +getFormat (GenotypePlink _ _ _ _ _ _) = "PLINK" +getFormat (GenotypeVCF _ _ ) = "VCF" -- | To facilitate automatic parsing of GenotypeDataSpec from JSON files instance FromJSON GenotypeDataSpec where - parseJSON = withObject "GenotypeData" $ \v -> GenotypeDataSpec - <$> v .: "format" - <*> v .: "genoFile" - <*> v .:? "genoFileChkSum" - <*> v .: "snpFile" - <*> v .:? "snpFileChkSum" - <*> v .: "indFile" - <*> v .:? "indFileChkSum" - <*> v .:? "snpSet" + parseJSON = withObject "GenotypeData" $ \v -> do + gformat <- v .: "format" + gfileSpec <- case gformat of + "EIGENSTRAT" -> GenotypeEigenstrat + <$> v .: "genoFile" + <*> v .:? "genoFileChkSum" + <*> v .: "snpFile" + <*> v .:? "snpFileChkSum" + <*> v .: "indFile" + <*> v .:? "indFileChkSum" + "PLINK" -> GenotypePlink + <$> v .: "genoFile" + <*> v .:? "genoFileChkSum" + <*> v .: "snpFile" + <*> v .:? "snpFileChkSum" + <*> v .: "indFile" + <*> v .:? "indFileChkSum" + "VCF" -> GenotypeVCF + <$> v .: "genoFile" + <*> v .:? "genoFileChkSum" + _ -> fail ("unknown format " ++ T.unpack gformat) + snpSet <- v .:? 
"snpSet" + return $ GenotypeDataSpec gfileSpec snpSet instance ToJSON GenotypeDataSpec where -- this encodes directly to a bytestring Builder - toJSON x = object [ - "format" .= format x, - "genoFile" .= genoFile x, - "genoFileChkSum".= genoFileChkSum x, - "snpFile" .= snpFile x, - "snpFileChkSum" .= snpFileChkSum x, - "indFile" .= indFile x, - "indFileChkSum" .= indFileChkSum x, - "snpSet" .= snpSet x - ] - --- | A data type representing the options fo the genotype format -data GenotypeFormatSpec = GenotypeFormatEigenstrat - | GenotypeFormatPlink - deriving (Eq) - -instance Show GenotypeFormatSpec where - show GenotypeFormatPlink = "PLINK" - show GenotypeFormatEigenstrat = "EIGENSTRAT" - --- | To facilitate automatic parsing of GenotypeFormatSpec from JSON files -instance FromJSON GenotypeFormatSpec where - parseJSON = withText "format" $ \v -> case v of - "EIGENSTRAT" -> pure GenotypeFormatEigenstrat - "PLINK" -> pure GenotypeFormatPlink - _ -> fail ("unknown format " ++ T.unpack v) - -instance ToJSON GenotypeFormatSpec where - toJSON a = case a of - GenotypeFormatPlink -> "PLINK" - GenotypeFormatEigenstrat -> "EIGENSTRAT" + toJSON (GenotypeDataSpec gfileSpec snpSet) = case gfileSpec of + GenotypeEigenstrat genoF genoFchk snpF snpFchk indF indFchk -> + object [ + "format" .= ("EIGENSTRAT" :: String), + "genoFile" .= genoF, + "genoFileChkSum".= genoFchk, + "snpFile" .= snpF, + "snpFileChkSum" .= snpFchk, + "indFile" .= indF, + "indFileChkSum" .= indFchk, + "snpSet" .= snpSet + ] + GenotypePlink genoF genoFchk snpF snpFchk indF indFchk -> + object [ + "format" .= ("PLINK" :: String), + "genoFile" .= genoF, + "genoFileChkSum".= genoFchk, + "snpFile" .= snpF, + "snpFileChkSum" .= snpFchk, + "indFile" .= indF, + "indFileChkSum" .= indFchk, + "snpSet" .= snpSet + ] + GenotypeVCF genoF genoFchk -> + object [ + "format" .= ("VCF" :: String), + "genoFile" .= genoF, + "genoFileChkSum".= genoFchk + ] data SNPSetSpec = SNPSet1240K | SNPSetHumanOrigins @@ -143,30 +167,75 @@ 
snpSetMerge SNPSetHumanOrigins SNPSet1240K True = SNPSetHumanOrigins snpSetMerge SNPSet1240K SNPSetHumanOrigins False = SNPSet1240K snpSetMerge SNPSetHumanOrigins SNPSet1240K False = SNPSet1240K +-- | Removes the directory part of all genotype filenames and returns a tuple of the common base directory and a modified GenotypeDataSpec with pure filenames +-- In case the base directories of the files do not match, this function throws a PoseidonUnequalBaseDirException +reduceGenotypeFilepaths :: (MonadThrow m) => GenotypeDataSpec -> m (FilePath, GenotypeDataSpec) +reduceGenotypeFilepaths gd@(GenotypeDataSpec gFileSpec _) = do + (baseDir, newGfileSpec) <- case gFileSpec of + GenotypeEigenstrat genoF _ snpF _ indF _ -> do + let baseDirs = map takeDirectory [genoF, snpF, indF] + fileNames = map takeFileName [genoF, snpF, indF] + unless (all (==(head baseDirs)) baseDirs) . throwM $ PoseidonUnequalBaseDirException genoF snpF indF + return (head baseDirs, gFileSpec {_esGenoFile = fileNames !! 0, _esSnpFile = fileNames !! 1, _esIndFile = fileNames !! 2}) + GenotypePlink genoF _ snpF _ indF _ -> do + let baseDirs = map takeDirectory [genoF, snpF, indF] + fileNames = map takeFileName [genoF, snpF, indF] + unless (all (==(head baseDirs)) baseDirs) . throwM $ PoseidonUnequalBaseDirException genoF snpF indF + return (head baseDirs, gFileSpec {_plGenoFile = fileNames !! 0, _plSnpFile = fileNames !! 1, _plIndFile = fileNames !! 2}) + GenotypeVCF genoF _ -> do + let baseDir = takeDirectory genoF + fileName = takeFileName genoF + return (baseDir, gFileSpec {_vcfGenoFile = fileName}) + return (baseDir, gd {genotypeFileSpec = newGfileSpec}) + -- | A function to return a list of all individuals in the genotype files of a package. loadIndividuals :: FilePath -- ^ the base directory -> GenotypeDataSpec -- ^ the Genotype spec -> PoseidonIO [EigenstratIndEntry] -- ^ the returned list of EigenstratIndEntries. 
-loadIndividuals d gd = do +loadIndividuals d (GenotypeDataSpec gFileSpec _) = do popMode <- envInputPlinkMode - liftIO $ checkFile (d indFile gd) Nothing - case format gd of - GenotypeFormatEigenstrat -> readEigenstratInd (d indFile gd) - GenotypeFormatPlink -> map (plinkFam2EigenstratInd popMode) <$> readFamFile (d indFile gd) + case gFileSpec of + GenotypeEigenstrat _ _ _ _ fn fnChk -> do + liftIO $ checkFile (d fn) fnChk + readEigenstratInd (d fn) + GenotypePlink _ _ _ _ fn fnChk -> do + liftIO $ checkFile (d fn) fnChk + map (plinkFam2EigenstratInd popMode) <$> readFamFile (d fn) + GenotypeVCF fn fnChk -> do + liftIO $ checkFile (d fn) fnChk + (VCFheader _ sampleNames , _) <- liftIO . runSafeT . readVCFfromFile $ (d fn) + --neither Sex nor population name is part of a VCF file, so we fill dummy values: + return [EigenstratIndEntry s Unknown "unknown" | s <- sampleNames] -- | A function to read the genotype data of a package loadGenotypeData :: (MonadSafe m) => FilePath -- ^ the base path -> GenotypeDataSpec -- ^ the genotype spec - -> PlinkPopNameMode -- ^ The Plink PopName Mode - -> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ()) - -- ^ a pair of the EigenstratIndEntries and a Producer over the Snp position values and the genotype line. -loadGenotypeData baseDir (GenotypeDataSpec format_ genoF _ snpF _ indF _ _) popMode = - case format_ of - GenotypeFormatEigenstrat -> readEigenstrat (baseDir genoF) (baseDir snpF) (baseDir indF) - GenotypeFormatPlink -> do - (famEntries, prod) <- readPlink (baseDir genoF) (baseDir snpF) (baseDir indF) - return (map (plinkFam2EigenstratInd popMode) famEntries, prod) + -> m (Producer (EigenstratSnpEntry, GenoLine) m ()) + -- ^ a Producer over the Snp position values and the genotype line. 
+loadGenotypeData baseDir (GenotypeDataSpec gFileSpec _) = + case gFileSpec of + GenotypeEigenstrat genoF _ snpF _ indF _ -> snd <$> readEigenstrat (baseDir genoF) (baseDir snpF) (baseDir indF) + GenotypePlink genoF _ snpF _ indF _ -> snd <$> readPlink (baseDir genoF) (baseDir snpF) (baseDir indF) + GenotypeVCF fn _ -> do + vcfProd <- snd <$> readVCFfromFile (baseDir fn) + return $ vcfProd >-> vcf2eigenstratPipe + +vcf2eigenstratPipe :: (MonadIO m) => Pipe VCFentry (EigenstratSnpEntry, GenoLine) m r +vcf2eigenstratPipe = for cat $ \vcfEntry -> do + case vcfToFreqSumEntry vcfEntry of --freqSum is a useful intermediate format. This function already does a bunch of checks of the VCF data. + Right (FreqSumEntry chrom pos snpId_ geneticPos ref alt alleleCounts) -> do + let eigenstratSnpEntry = EigenstratSnpEntry chrom pos (maybe 0.0 id geneticPos) (maybe "" id snpId_) ref alt + genoLine <- V.fromList <$> forM alleleCounts (\alleleCount -> do + case alleleCount of + Nothing -> return Missing + Just 0 -> return HomRef + Just 1 -> return Het + Just 2 -> return HomAlt + _ -> liftIO . throwIO . PoseidonGenotypeException $ + "illegal dosage (" ++ show alleleCount ++ ") in VCF file at chrom " ++ show chrom ++ ", position " ++ show pos) + yield (eigenstratSnpEntry, genoLine) + Left err -> liftIO . throwIO . 
PoseidonGenotypeException $ err joinEntries :: (MonadIO m) => LogA -> [Int] -> [String] -> [Maybe (EigenstratSnpEntry, GenoLine)] -> m (EigenstratSnpEntry, GenoLine) joinEntries logA nrInds pacNames maybeTupleList = do diff --git a/src/Poseidon/Janno.hs b/src/Poseidon/Janno.hs index c1c80104..8f6057fd 100644 --- a/src/Poseidon/Janno.hs +++ b/src/Poseidon/Janno.hs @@ -41,6 +41,7 @@ module Poseidon.Janno ( parseCsvParseError, renderCsvParseError, getMaybeListColumn, + jannoRows2EigenstratIndEntries ) where import Poseidon.ColumnTypes @@ -97,7 +98,7 @@ getCsvNR :: CsvNamedRecord -> Csv.NamedRecord getCsvNR (CsvNamedRecord x) = x -- | A data type to represent a janno file -newtype JannoRows = JannoRows [JannoRow] +newtype JannoRows = JannoRows {getJannoRows :: [JannoRow]} deriving (Show, Eq, Generic) instance Semigroup JannoRows where @@ -649,3 +650,10 @@ checkRelationColsConsistent x = -- deriving with TemplateHaskell necessary for the generics magic in the Survey module deriveGeneric ''JannoRow + +-- | a convenience function to construct Eigenstrat Ind entries out of jannoRows +jannoRows2EigenstratIndEntries :: JannoRows -> [EigenstratIndEntry] +jannoRows2EigenstratIndEntries (JannoRows jannoRows) = do -- list monad + jannoRow <- jannoRows -- looping over jannoRows + let GroupName gText = head . getListColumn . 
jGroupName $ jannoRow + return $ EigenstratIndEntry (jPoseidonID jannoRow) (sfSex (jGeneticSex jannoRow)) (T.unpack gText) diff --git a/src/Poseidon/Package.hs b/src/Poseidon/Package.hs index 1717dfa2..d635b269 100644 --- a/src/Poseidon/Package.hs +++ b/src/Poseidon/Package.hs @@ -40,9 +40,11 @@ import Poseidon.EntityTypes (EntitySpec, HasNameAndVersion (..), isLatestInCollection, makePacNameAndVersion, renderNameWithVersion) -import Poseidon.GenotypeData (GenotypeDataSpec (..), joinEntries, +import Poseidon.GenotypeData (GenotypeDataSpec (..), + GenotypeFileSpec (..), joinEntries, loadGenotypeData, loadIndividuals, - printSNPCopyProgress) + printSNPCopyProgress, + reduceGenotypeFilepaths) import Poseidon.Janno (GeneticSex (..), JannoLibraryBuilt (..), JannoRow (..), JannoRows (..), @@ -62,9 +64,9 @@ import Poseidon.ServerClient (AddJannoColSpec (..), GroupInfo (..), PackageInfo (..)) import Poseidon.Utils (LogA, PoseidonException (..), PoseidonIO, checkFile, - envErrorLength, envInputPlinkMode, - envLogAction, logDebug, logError, - logInfo, logWarning, logWithEnv, + envErrorLength, envLogAction, + logDebug, logError, logInfo, + logWarning, logWithEnv, renderPoseidonException) import Control.DeepSeq (($!!)) @@ -105,7 +107,7 @@ import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), EigenstratSnpEntry (..), GenoEntry (..), GenoLine, readEigenstratSnpFile) -import SequenceFormats.Plink (PlinkPopNameMode (..), readBimFile) +import SequenceFormats.Plink (readBimFile) import System.Directory (doesDirectoryExist, listDirectory) import System.FilePath (takeBaseName, takeDirectory, takeExtension, takeFileName, ()) @@ -285,7 +287,7 @@ readPoseidonPackageCollectionWithSkipIndicator opts baseDirs = do logInfo "Initializing packages... " eitherPackages <- mapM tryDecodePoseidonPackage $ zip [1..] posFiles -- notifying the users of package problems - skipIndicator <- if (null . lefts $ eitherPackages) then return False else do + skipIndicator <- if null . 
lefts $ eitherPackages then return False else do logWarning "Some packages were skipped due to issues:" forM_ (zip posFiles eitherPackages) $ \(posF, epac) -> do case epac of @@ -376,12 +378,16 @@ readPoseidonPackage opts ymlPath = do -- read janno (or fill with empty dummy object) indEntries <- loadIndividuals baseDir geno + let (checkSex, checkGroups) = case genotypeFileSpec geno of + GenotypeVCF _ _ -> (False, False) + _ -> (True, True) + janno <- case poseidonJannoFilePath baseDir yml of Nothing -> do return $ createMinimalJanno indEntries Just p -> do loadedJanno <- readJannoFile p - liftIO $ checkJannoIndConsistency tit loadedJanno indEntries + liftIO $ checkJannoIndConsistency tit loadedJanno indEntries checkSex checkGroups return loadedJanno -- read seqSource @@ -418,7 +424,6 @@ checkYML yml = do validateGeno :: PoseidonPackage -> Bool -> PoseidonIO () validateGeno pac checkFullGeno = do logA <- envLogAction - plinkMode <- envInputPlinkMode errLength <- envErrorLength --let jannoRows = getJannoRowsFromPac pac --let ploidyList = map jGenotypePloidy jannoRows @@ -427,7 +432,7 @@ validateGeno pac checkFullGeno = do runSafeT $ do -- we're using getJointGenotypeData here on a single package to check for SNP consistency -- since that check is only implemented in the jointLoading function, not in the per-package loading - (_, eigenstratProd) <- getJointGenotypeData logA False plinkMode [pac] Nothing + eigenstratProd <- getJointGenotypeData logA False [pac] Nothing -- check all or only the first 100 SNPs if checkFullGeno then do @@ -483,18 +488,36 @@ checkFiles baseDir ignoreChecksums ignoreGenotypeFilesMissing yml = do unless ignoreGenotypeFilesMissing $ do let gd = _posYamlGenotypeData yml d = baseDir - if ignoreChecksums - then do - checkFile (d genoFile gd) Nothing - checkFile (d snpFile gd) Nothing - checkFile (d indFile gd) Nothing - else do - checkFile (d genoFile gd) $ genoFileChkSum gd - checkFile (d snpFile gd) $ snpFileChkSum gd - checkFile (d indFile 
gd) $ indFileChkSum gd - -checkJannoIndConsistency :: String -> JannoRows -> [EigenstratIndEntry] -> IO () -checkJannoIndConsistency pacName (JannoRows rows) indEntries = do + case genotypeFileSpec gd of + GenotypeEigenstrat genoF genoFc snpF snpFc indF indFc -> do + if ignoreChecksums + then do + checkFile (d genoF) Nothing + checkFile (d snpF) Nothing + checkFile (d indF) Nothing + else do + checkFile (d genoF) genoFc + checkFile (d snpF) snpFc + checkFile (d indF) indFc + GenotypePlink genoF genoFc snpF snpFc indF indFc -> do + if ignoreChecksums + then do + checkFile (d genoF) Nothing + checkFile (d snpF) Nothing + checkFile (d indF) Nothing + else do + checkFile (d genoF) genoFc + checkFile (d snpF) snpFc + checkFile (d indF) indFc + GenotypeVCF genoF genoFc -> do + if ignoreChecksums + then checkFile (d genoF) Nothing + else checkFile (d genoF) genoFc + +-- the final two flags are important for reading VCFs, which lack group and sex information. So +-- we want to skip these checks in this case, see client code in readPoseidonPackage +checkJannoIndConsistency :: String -> JannoRows -> [EigenstratIndEntry] -> Bool -> Bool -> IO () +checkJannoIndConsistency pacName (JannoRows rows) indEntries checkGroups checkSex = do let genoIDs = [ x | EigenstratIndEntry x _ _ <- indEntries] genoSexs = [ x | EigenstratIndEntry _ x _ <- indEntries] genoGroups = [ x | EigenstratIndEntry _ _ x <- indEntries] @@ -507,10 +530,10 @@ checkJannoIndConsistency pacName (JannoRows rows) indEntries = do when idMis $ throwM $ PoseidonCrossFileConsistencyException pacName $ "Individual ID mismatch between genotype data (left) and .janno files (right): " ++ renderMismatch genoIDs jannoIDs - when sexMis $ throwM $ PoseidonCrossFileConsistencyException pacName $ + when (sexMis && checkSex) $ throwM $ PoseidonCrossFileConsistencyException pacName $ "Individual Sex mismatch between genotype data (left) and .janno files (right): " ++ renderMismatch (map show genoSexs) (map show jannoSexs) - when 
groupMis $ throwM $ PoseidonCrossFileConsistencyException pacName $ + when (groupMis && checkGroups) $ throwM $ PoseidonCrossFileConsistencyException pacName $ "Individual GroupID mismatch between genotype data (left) and .janno files (right). Note \ \that this could be due to a wrong Plink file population-name encoding \ \(see the --inPlinkPopName option). " ++ @@ -598,18 +621,15 @@ findAllPoseidonYmlFiles baseDir = do getJointGenotypeData :: MonadSafe m => LogA -- ^ how messages should be logged -> Bool -- ^ whether to generate an intersection instead of union of input sites - -> PlinkPopNameMode -- ^ how to read population labels from Plink -> [PoseidonPackage] -- ^ A list of poseidon packages. -> Maybe FilePath -- ^ a genotype file to select SNPs from - -> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ()) + -> m (Producer (EigenstratSnpEntry, GenoLine) m ()) -- ^ a pair of the EigenstratIndEntries and a Producer over the Snp position values and the genotype line, joined across all packages. -getJointGenotypeData logA intersect popMode pacs maybeSnpFile = do - genotypeTuples <- sequence [loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) popMode | pac <- pacs] - let indEntries = map fst genotypeTuples - jointIndEntries = concat indEntries - nrInds = map length indEntries +getJointGenotypeData logA intersect pacs maybeSnpFile = do + genotypeProducers <- sequence [loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) | pac <- pacs] + let nrInds = map (length . getJannoRows . posPacJanno) pacs pacNames = map getPacName pacs - prod = (orderedZipAll compFunc . 
map snd) genotypeTuples >-> + prod = orderedZipAll compFunc genotypeProducers >-> P.filter filterUnionOrIntersection >-> joinEntryPipe logA nrInds pacNames jointProducer <- case maybeSnpFile of Nothing -> do @@ -617,7 +637,7 @@ getJointGenotypeData logA intersect popMode pacs maybeSnpFile = do Just fn -> do let snpProd = loadBimOrSnpFile fn >-> orderCheckPipe compFunc3 return $ (orderedZip compFunc2 snpProd prod >> return [()]) >-> selectSnps (sum nrInds) - return (jointIndEntries, void jointProducer) + return (void jointProducer) -- the void here just replaces a list of return values [(), (), ()] from the orderedZip to a single () where compFunc :: (EigenstratSnpEntry, GenoLine) -> (EigenstratSnpEntry, GenoLine) -> Ordering compFunc (EigenstratSnpEntry c1 p1 _ _ _ _, _) (EigenstratSnpEntry c2 p2 _ _ _ _, _) = compare (c1, p1) (c2, p2) @@ -657,16 +677,17 @@ loadBimOrSnpFile fn | otherwise = throwM (PoseidonGenotypeException "option snpFile requires file endings to be *.snp or *.bim or *.snp.gz or *.bim.gz") -- | A function to create a minimal POSEIDON package -newMinimalPackageTemplate :: FilePath -> String -> GenotypeDataSpec -> PoseidonPackage -newMinimalPackageTemplate baseDir name (GenotypeDataSpec format_ geno _ snp _ ind _ snpSet_) = - PoseidonPackage { +newMinimalPackageTemplate :: (MonadThrow m) => FilePath -> String -> GenotypeDataSpec -> m PoseidonPackage +newMinimalPackageTemplate baseDir name gd = do + reducedGD <- snd <$> reduceGenotypeFilepaths gd + return $ PoseidonPackage { posPacBaseDir = baseDir , posPacPoseidonVersion = asVersion latestPoseidonVersion , posPacNameAndVersion = PacNameAndVersion name Nothing , posPacDescription = Nothing , posPacContributor = [] , posPacLastModified = Nothing - , posPacGenotypeData = GenotypeDataSpec format_ (takeFileName geno) Nothing (takeFileName snp) Nothing (takeFileName ind) Nothing snpSet_ + , posPacGenotypeData = reducedGD , posPacJannoFile = Nothing , posPacJanno = mempty , posPacJannoFileChkSum = Nothing 
@@ -681,24 +702,14 @@ newMinimalPackageTemplate baseDir name (GenotypeDataSpec format_ geno _ snp _ in } makePseudoPackageFromGenotypeData :: GenotypeDataSpec -> PoseidonIO PoseidonPackage -makePseudoPackageFromGenotypeData (GenotypeDataSpec format_ genoFile_ _ snpFile_ _ indFile_ _ snpSet_) = do - let baseDir = getBaseDir genoFile_ snpFile_ indFile_ - outInd = takeFileName indFile_ - outSnp = takeFileName snpFile_ - outGeno = takeFileName genoFile_ - genotypeData = GenotypeDataSpec format_ outGeno Nothing outSnp Nothing outInd Nothing snpSet_ - pacName = takeBaseName genoFile_ - inds <- loadIndividuals baseDir genotypeData - newPackageTemplate baseDir pacName genotypeData (Just (Left inds)) mempty [] - where - getBaseDir :: FilePath -> FilePath -> FilePath -> FilePath - getBaseDir g s i = - let baseDirGeno = takeDirectory genoFile_ - baseDirSnp = takeDirectory snpFile_ - baseDirInd = takeDirectory indFile_ - in if baseDirGeno == baseDirSnp && baseDirSnp == baseDirInd - then baseDirGeno - else throwM $ PoseidonUnequalBaseDirException g s i +makePseudoPackageFromGenotypeData gd = do + (baseDir, reducedGenotypeDataSpec) <- reduceGenotypeFilepaths gd + let pacName = case genotypeFileSpec reducedGenotypeDataSpec of + GenotypeEigenstrat fn _ _ _ _ _ -> takeBaseName fn + GenotypePlink fn _ _ _ _ _ -> takeBaseName fn + GenotypeVCF fn _ -> takeBaseName fn + inds <- loadIndividuals baseDir reducedGenotypeDataSpec + newPackageTemplate baseDir pacName reducedGenotypeDataSpec (Just (Left inds)) mempty [] -- | A function to create a more complete POSEIDON package -- This will take only the filenames of the provided files, so it assumes that the files will be copied into @@ -713,8 +724,8 @@ newPackageTemplate :: -> PoseidonIO PoseidonPackage newPackageTemplate baseDir name genoData indsOrJanno seqSource bib = do (UTCTime today _) <- liftIO getCurrentTime - let minimalTemplate = newMinimalPackageTemplate baseDir name genoData - fluffedUpTemplate = minimalTemplate { + 
minimalTemplate <- newMinimalPackageTemplate baseDir name genoData + let fluffedUpTemplate = minimalTemplate { posPacDescription = Just "Empty package template. Please add a description" , posPacContributor = [] , posPacNameAndVersion = PacNameAndVersion name (Just $ makeVersion [0, 1, 0]) diff --git a/test/Poseidon/InterfaceSpec.hs b/test/Poseidon/InterfaceSpec.hs index 2cece88e..a493b213 100644 --- a/test/Poseidon/InterfaceSpec.hs +++ b/test/Poseidon/InterfaceSpec.hs @@ -1,7 +1,7 @@ module Poseidon.InterfaceSpec (spec) where import Poseidon.CLI.OptparseApplicativeParsers -import Poseidon.GenotypeData (GenotypeFormatSpec (..)) +import Poseidon.GenotypeData (GenotypeFileSpec (..)) import qualified Options.Applicative as OP import Test.Hspec @@ -18,7 +18,13 @@ testParseInGenoOne = describe "Poseidon.OptparseApplicativeParsers.parseInGenoOne" $ do it "should detect zipped files correctly" $ do let maybeValEigenstrat = runParser parseInGenoOne ["-p", "path/to/file.geno.gz"] - maybeValEigenstrat `shouldBe` Just (GenotypeFormatEigenstrat, "path/to/file.geno.gz", "path/to/file.snp.gz", "path/to/file.ind") + maybeValEigenstrat `shouldBe` + Just (GenotypeEigenstrat "path/to/file.geno.gz" Nothing + "path/to/file.snp.gz" Nothing + "path/to/file.ind" Nothing) let maybeValPlink = runParser parseInGenoOne ["-p", "path/to/file.bim.gz"] - maybeValPlink `shouldBe` Just (GenotypeFormatPlink, "path/to/file.bed.gz", "path/to/file.bim.gz", "path/to/file.fam") + maybeValPlink `shouldBe` + Just (GenotypePlink "path/to/file.bed.gz" Nothing + "path/to/file.bim.gz" Nothing + "path/to/file.fam" Nothing) diff --git a/test/Poseidon/PackageSpec.hs b/test/Poseidon/PackageSpec.hs index 3660a03f..174c26b5 100644 --- a/test/Poseidon/PackageSpec.hs +++ b/test/Poseidon/PackageSpec.hs @@ -5,7 +5,7 @@ module Poseidon.PackageSpec (spec) where import Poseidon.Contributor (ContributorSpec (..), ORCID (..)) import Poseidon.EntityTypes (HasNameAndVersion (..)) import Poseidon.GenotypeData (GenotypeDataSpec 
(..), - GenotypeFormatSpec (..), + GenotypeFileSpec (..), SNPSetSpec (..)) import Poseidon.Package (PackageReadOptions (..), PoseidonPackage (..), @@ -47,6 +47,7 @@ spec = do testZipWithPadding testGetJointGenotypeData testGetJointGzippedGenotypeData + testGetVCFdata testThrowOnRead testPacReadOpts :: PackageReadOptions @@ -96,14 +97,15 @@ truePackageRelPaths = PoseidonYamlStruct { _posYamlPackageVersion = Just $ makeVersion [1, 0, 0], _posYamlLastModified = Just $ fromGregorian 2020 2 28, _posYamlGenotypeData = GenotypeDataSpec { - format = GenotypeFormatPlink, - genoFile = "Schiffels_2016.bed", - genoFileChkSum = Nothing, - snpFile = "Schiffels_2016.bim", - snpFileChkSum = Nothing, - indFile = "Schiffels_2016.fam", - indFileChkSum = Nothing, - snpSet = Just SNPSet1240K + genotypeFileSpec = GenotypePlink { + _plGenoFile = "Schiffels_2016.bed", + _plGenoFileChkSum = Nothing, + _plSnpFile = "Schiffels_2016.bim", + _plSnpFileChkSum = Nothing, + _plIndFile = "Schiffels_2016.fam", + _plIndFileChkSum = Nothing + }, + genotypeSnpSet = Just SNPSet1240K }, _posYamlJannoFile = Just "Schiffels_2016.janno", _posYamlJannoFileChkSum = Nothing, @@ -153,7 +155,7 @@ testPoseidonFromYAML = describe "PoseidonPackage.fromYAML" $ do p_ = fromRight dummyPackageYamlStruct $ decodeTest yamlPackage2 gd = _posYamlGenotypeData p_ gdTrue = _posYamlGenotypeData truePackageRelPaths - gd `shouldBe` gdTrue {snpSet = Nothing} + gd `shouldBe` gdTrue {genotypeSnpSet = Nothing} it "should parse missing contributor field as empty list" $ do let yamlPackage2 = replace "contributor:\n - name: Stephan Schiffels\n email: schiffels@institute.org\n orcid: 0000-0002-1017-9150" "" yamlPackage @@ -176,14 +178,15 @@ testPoseidonFromYAML = describe "PoseidonPackage.fromYAML" $ do _posYamlPackageVersion = Nothing, _posYamlLastModified = Nothing, _posYamlGenotypeData = GenotypeDataSpec { - format = GenotypeFormatPlink, - genoFile = "test.bed", - genoFileChkSum = Nothing, - snpFile = "test.bim", - snpFileChkSum = 
Nothing, - indFile = "test.fam", - indFileChkSum = Nothing, - snpSet = Nothing + genotypeFileSpec = GenotypePlink { + _plGenoFile = "test.bed", + _plGenoFileChkSum = Nothing, + _plSnpFile = "test.bim", + _plSnpFileChkSum = Nothing, + _plIndFile = "test.fam", + _plIndFileChkSum = Nothing + }, + genotypeSnpSet = Nothing }, _posYamlJannoFile = Nothing, _posYamlJannoFileChkSum = Nothing, @@ -277,7 +280,7 @@ testGetJointGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do it "should correctly load genotype data without intersect" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog False PlinkPopNameAsFamily pacs Nothing + jointProd <- getJointGenotypeData noLog False pacs Nothing P.toListM jointProd length jointDat `shouldBe` 10 jointDat !! 3 `shouldBe` (EigenstratSnpEntry (Chrom "1") 903426 0.024457 "1_903426" 'C' 'T', @@ -287,7 +290,7 @@ testGetJointGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do it "should correctly load genotype data with intersect" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog True PlinkPopNameAsFamily pacs Nothing + jointProd <- getJointGenotypeData noLog True pacs Nothing P.toListM jointProd length jointDat `shouldBe` 8 jointDat !! 
3 `shouldBe` (EigenstratSnpEntry (Chrom "1") 949654 0.025727 "1_949654" 'A' 'G', @@ -297,19 +300,19 @@ testGetJointGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do it "should correctly load the right nr of SNPs with snpFile and no intersect" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog False PlinkPopNameAsFamily pacs (Just "test/testDat/snpFile.snp") + jointProd <- getJointGenotypeData noLog False pacs (Just "test/testDat/snpFile.snp") P.toListM jointProd length jointDat `shouldBe` 6 it "should correctly load the right nr of SNPs with snpFile and intersect" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog True PlinkPopNameAsFamily pacs (Just "test/testDat/snpFile.snp") + jointProd <- getJointGenotypeData noLog True pacs (Just "test/testDat/snpFile.snp") P.toListM jointProd length jointDat `shouldBe` 4 it "should fail with unordered SNP input file" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles let makeJointDat = runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog False PlinkPopNameAsFamily pacs (Just "test/testDat/snpFile_unordered.snp") + jointProd <- getJointGenotypeData noLog False pacs (Just "test/testDat/snpFile_unordered.snp") P.toListM jointProd makeJointDat `shouldThrow` isInputOrderException it "should skip incongruent alleles" $ do @@ -317,7 +320,7 @@ testGetJointGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do "test/testDat/testPackages/test_incongruent_snps/POSEIDON.yml"] pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles2 jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog False PlinkPopNameAsFamily pacs Nothing + jointProd <- getJointGenotypeData noLog False pacs Nothing P.toListM jointProd length jointDat `shouldBe` 7 where @@ -326,12 
+329,12 @@ testGetJointGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do testGetJointGzippedGenotypeData :: Spec testGetJointGzippedGenotypeData = describe "Poseidon.Package.getJointGenotypeData" $ do - let pacFiles = ["test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_gzipped.yml", + let pacFiles = ["test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/POSEIDON.yml", "test/testDat/testPackages/ancient/Schiffels_2016/POSEIDON.yml"] it "should correctly load gzipped and non-gzipped genotype data without intersect" $ do pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles jointDat <- runSafeT $ do - (_, jointProd) <- getJointGenotypeData noLog False PlinkPopNameAsFamily pacs Nothing + jointProd <- getJointGenotypeData noLog False pacs Nothing P.toListM jointProd length jointDat `shouldBe` 10 jointDat !! 3 `shouldBe` (EigenstratSnpEntry (Chrom "1") 903426 0.024457 "1_903426" 'C' 'T', @@ -339,11 +342,26 @@ testGetJointGzippedGenotypeData = describe "Poseidon.Package.getJointGenotypeDat jointDat !! 5 `shouldBe` (EigenstratSnpEntry (Chrom "2") 1018704 0.026288 "2_1018704" 'A' 'G', V.fromList $ replicate 10 Missing ++ [Het, Het, HomRef, Het, Missing, HomAlt, Het, HomRef, HomAlt, Het]) +testGetVCFdata :: Spec +testGetVCFdata = describe "Poseidon.Package.getJointGenotypeData" $ do + let pacFiles = ["test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON.yml", + "test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/POSEIDON.yml"] + it "should correctly load VCF and Eigenstrat genotype data" $ do + pacs <- testLog $ mapM (readPoseidonPackage testPacReadOpts) pacFiles + jointDat <- runSafeT $ do + jointProd <- getJointGenotypeData noLog False pacs Nothing + P.toListM jointProd + length jointDat `shouldBe` 10 + jointDat !! 
3 `shouldBe` (EigenstratSnpEntry (Chrom "1") 903426 0.024457 "1_903426" 'C' 'T', + V.fromList $ [Het, Het, HomAlt, Het, HomRef, HomRef, Het, HomRef, HomRef, HomAlt] ++ replicate 10 Missing) + jointDat !! 5 `shouldBe` (EigenstratSnpEntry (Chrom "2") 1018704 0.0 "2_1018704" 'A' 'G', + V.fromList $ replicate 10 Missing ++ [Het, Het, HomRef, Het, Missing, HomAlt, Het, HomRef, HomAlt, Het]) + testThrowOnRead :: Spec testThrowOnRead = describe "Poseidon.Package.readPoseidonPackage" $ do it "should throw if bibentries aren't found" $ do let opts = defaultPackageReadOptions {_readOptGenoCheck = False} - let ymlPath = "test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_nobib.yml" + let ymlPath = "test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/POSEIDON.yml" testLog (readPoseidonPackage opts ymlPath) `shouldThrow` isPoseidonCrossFileConsistencyException it "should throw if Plink Setting is not correct" $ do let opts = defaultPackageReadOptions @@ -351,7 +369,7 @@ testThrowOnRead = describe "Poseidon.Package.readPoseidonPackage" $ do usePoseidonLogger NoLog Testing PlinkPopNameAsPhenotype CharInf (readPoseidonPackage opts ymlPath) `shouldThrow` isPoseidonCrossFileConsistencyException it "should not throw if Plink Setting is correct" $ do let opts = defaultPackageReadOptions - let ymlPath = "test/testDat/testPackages/ancient/Wang_2020/POSEIDON_otherPlinkEncoding.yml" + let ymlPath = "test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/POSEIDON.yml" _ <- usePoseidonLogger NoLog Testing PlinkPopNameAsPhenotype CharInf (readPoseidonPackage opts ymlPath) return () where diff --git a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt index eb6c3ede..923fd0b6 100644 --- a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt +++ b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt @@ -7,6 +7,10 @@ fd632717ecaf337a39cfd7a828a54e99 init init/Schiffels/Schiffels.janno 
9edc4a757f785a8ecb59c54d16c5690a init init/Schiffels/Schiffels.bib c35421d9be15aa66fa3a3c46df1f746c init init/Wang/POSEIDON.yml ae66d851301f4a761b819f97ec28fa55 init init/Wang/Wang_2020.bed +956c7bf4c6999cc322ad8407d8bef776 init init_vcf/Schiffels_vcf/POSEIDON.yml +02386e2064f92c68d4be8fea2e2f923c init init_vcf/Schiffels_vcf/Schiffels.janno +b60795356b27b5e35002e44fcb24b0a7 init init_vcf/Schiffels_vcf/geno.vcf +9edc4a757f785a8ecb59c54d16c5690a init init_vcf/Schiffels_vcf/Schiffels.bib d41d8cd98f00b204e9800998ecf8427e validate validate/validate1 d41d8cd98f00b204e9800998ecf8427e validate validate/validate2 d41d8cd98f00b204e9800998ecf8427e validate validate/validate3 @@ -39,17 +43,25 @@ b46831b007c2d53a12b472484b7b00b4 genoconvert init/Wang/Wang.snp 3c38f40efe215a047c02f4e98e0390da genoconvert init/Schiffels/geno.bed 8538ffd971ebb12cf5ef6e338da27970 genoconvert init/Schiffels/geno.bim a9fcb59cb933d8f183f3c3f6fbf2e213 genoconvert init/Schiffels/geno.fam +70941762e1a2cb0852af797c70fe0a77 genoconvert init_vcf/Schiffels_vcf/geno.bed +d8d1e5dd679ae256c40340e726a0cdc5 genoconvert init_vcf/Schiffels_vcf/geno.bim +9d880883cfc0783508fdf1ac980711a3 genoconvert init_vcf/Schiffels_vcf/geno.fam 72c7a584feefddb31f817c205355dd14 rectify init/Schiffels/POSEIDON.yml 4aeae97cfc44b55b2a8005425d786148 rectify init/Schiffels/CHANGELOG.md -925e402351afd974403402d141abe342 rectify init/Schiffels/POSEIDON.yml +da981f8d52f60ec4d96865b224648c92 rectify init/Schiffels/POSEIDON.yml 3bb396e099d5b8771a3409f5fe85d70b rectify init/Schiffels/CHANGELOG.md -b9216f365108c5c5c66f78cceb2eb09a rectify init/Schiffels/POSEIDON.yml +dc322649188ce2995cea8a46a7f97f3e rectify init/Schiffels/POSEIDON.yml 3bb396e099d5b8771a3409f5fe85d70b rectify init/Schiffels/CHANGELOG.md 2757f727e02dd6453fffe68c4c6ec4c8 forge forge/ForgePac1/POSEIDON.yml 1286a2580e4bfbed7d804d5f3fe125f7 forge forge/ForgePac1/ForgePac1.geno 8846333d9a1de6510f25a3816cc70fef forge forge/ForgePac1/ForgePac1.janno 9089f5d5602937bb7713e1dc8d7a8f2d 
forge forge/ForgePac1/ForgePac1.ssf b4f71aff4fbc11594008c3811781cc43 forge forge/ForgePac1/ForgePac1.bib +15ea8dd8b98172e3554732ceb2adfe09 forge forge/ForgePac1_vcf/POSEIDON.yml +9e0c33410f399caf99446f01a7ed7809 forge forge/ForgePac1_vcf/ForgePac1_vcf.geno +8846333d9a1de6510f25a3816cc70fef forge forge/ForgePac1_vcf/ForgePac1_vcf.janno +9089f5d5602937bb7713e1dc8d7a8f2d forge forge/ForgePac1_vcf/ForgePac1_vcf.ssf +b4f71aff4fbc11594008c3811781cc43 forge forge/ForgePac1_vcf/ForgePac1_vcf.bib 47485dc1f997a30c61cd2f72fe259013 forge forge/ForgePac2/POSEIDON.yml 6010163f73dc9cf5185933fc7a0333df forge forge/ForgePac2/ForgePac2.bed 0542b6a5a04a74237f1c1d02783c87e1 forge forge/ForgePac3/POSEIDON.yml @@ -108,7 +120,7 @@ b7b649620cd37bd4a6d6f0f31c1c56da forge forge/ForgePac19/ForgePac19.janno d4a05cfef045648238a94a9d621cf667 chronicle chronicle/chronicle1.yml b43da4d5734371c0648553120f812466 timetravel timetravel/Lamnidis_2018-1.0.0/POSEIDON.yml 8d57ce1a1ab28c0d8a5f391dd790a59c timetravel timetravel/Lamnidis_2018-1.0.1/POSEIDON.yml -b9216f365108c5c5c66f78cceb2eb09a timetravel timetravel/Schiffels-1.1.1/POSEIDON.yml +dc322649188ce2995cea8a46a7f97f3e timetravel timetravel/Schiffels-1.1.1/POSEIDON.yml 1ab24c45ef3a13e0fb34afac7a21dca8 timetravel timetravel/Schmid_2028-1.0.0/POSEIDON.yml 8d57ce1a1ab28c0d8a5f391dd790a59c fetch fetch/by_package/Lamnidis_2018-1.0.1/POSEIDON.yml 1ab24c45ef3a13e0fb34afac7a21dca8 fetch fetch/by_package/Schmid_2028-1.0.0/POSEIDON.yml diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Schiffels/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/Schiffels/POSEIDON.yml index ecec7011..1feb73d5 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Schiffels/POSEIDON.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/Schiffels/POSEIDON.yml @@ -12,11 +12,8 @@ lastModified: 1970-01-01 genotypeData: format: EIGENSTRAT genoFile: geno.txt - genoFileChkSum: 0332344057c0c4dce2ff7176f8e1103d snpFile: snp.txt - snpFileChkSum: 
d76e3e7a8fc0f1f5e435395424b5aeab indFile: ind.txt - indFileChkSum: f77dc756666dbfef3bb35191ae15a167 snpSet: Other jannoFile: Schiffels.janno jannoFileChkSum: fd632717ecaf337a39cfd7a828a54e99 diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml index 3406ef0c..f4402da5 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml @@ -1,29 +1,29 @@ title: Chronicle title description: Chronicle description chronicleVersion: 0.2.0 -lastModified: 2024-09-03 +lastModified: 2024-10-25 packages: - title: Lamnidis_2018 version: 1.0.0 - commit: 5b1a73d2d20f7ee1cebcf4c20b957e6ea65287c8 + commit: e20fd40ca6aae8aafe97b3e60aa63a69045de227 path: Lamnidis_2018 - title: Lamnidis_2018 version: 1.0.1 - commit: 5b1a73d2d20f7ee1cebcf4c20b957e6ea65287c8 + commit: e20fd40ca6aae8aafe97b3e60aa63a69045de227 path: Lamnidis_2018_newVersion - title: Schiffels version: 1.1.1 - commit: 2fa3e84429bc0b9f9283102f1956b396c7320419 + commit: b5ecaa05ecbdfa85bd8fa3646e1680e0c88a4020 path: Schiffels - title: Schiffels_2016 version: 1.0.1 - commit: 5b1a73d2d20f7ee1cebcf4c20b957e6ea65287c8 + commit: e20fd40ca6aae8aafe97b3e60aa63a69045de227 path: Schiffels_2016 - title: Schmid_2028 version: 1.0.0 - commit: 5b1a73d2d20f7ee1cebcf4c20b957e6ea65287c8 + commit: e20fd40ca6aae8aafe97b3e60aa63a69045de227 path: Schmid_2028 - title: Wang_2020 version: 0.1.0 - commit: 5b1a73d2d20f7ee1cebcf4c20b957e6ea65287c8 + commit: e20fd40ca6aae8aafe97b3e60aa63a69045de227 path: Wang_2020 diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.bib b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.bib new file mode 100644 index 00000000..c3cd3ae0 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.bib @@ -0,0 +1,16 @@ +@article{Schiffels2016, + title = {Test}, 
+} + +@book{TestBook2, + title = {TestBook}, +} + +@article{TestPaper1, + title = {TestPaper}, +} + +@article{Wang2020, + title = {Test}, +} + diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.geno b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.geno new file mode 100644 index 00000000..cef489e7 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.geno @@ -0,0 +1,16 @@ +000099 +022299 +009199 +101099 +119099 +002299 +002299 +212299 +201299 +999910 +999922 +999911 +999912 +999910 +999911 +999910 diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ind b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ind new file mode 100644 index 00000000..e9f505e9 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ind @@ -0,0 +1,6 @@ +XXX002 F POP2 +XXX004 F POP2 +XXX005 M POP2 +XXX006 F POP2 +SAMPLE2 F 3 +SAMPLE4 F 5 diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.janno b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.janno new file mode 100644 index 00000000..c80e79c4 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.janno @@ -0,0 +1,7 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords 
AddCol1 AddCol2 +XXX002 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016 n/a n/a v1 v2 +XXX004 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016 n/a n/a v1 v2 +XXX005 M POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016;TestPaper1 n/a n/a v1 v2 +XXX006 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016;TestPaper1 n/a n/a v1 v2 +SAMPLE2 F 3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Wang2020;TestPaper1 n/a n/a n/a n/a +SAMPLE4 F 5 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Wang2020;TestPaper1;TestBook2 n/a n/a n/a n/a diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.snp b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.snp new file mode 100644 index 00000000..1d300827 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.snp @@ -0,0 +1,16 @@ +1_752566 1 0.0 752566 G A +1_842013 1 0.0 842013 T G +1_891021 1 0.0 891021 G A +1_949654 1 0.0 949654 A G +2_1018704 2 0.0 1018704 A G +2_1045331 2 0.0 1045331 G A +2_1048955 2 0.0 1048955 A G +2_1061166 2 0.0 1061166 T C +2_1108637 2 0.0 1108637 G A +rs0000 11 0.0 0 A C +rs1111 11 1.0e-3 100000 A G +rs2222 11 2.0e-3 200000 A T +rs3333 11 3.0e-3 300000 C A +rs4444 11 4.0e-3 400000 G A +rs5555 11 
5.0e-3 500000 T A +rs6666 11 6.0e-3 600000 G T diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ssf b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ssf new file mode 100644 index 00000000..c51f729c --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/ForgePac1_vcf.ssf @@ -0,0 +1,3 @@ +poseidon_IDs udg library_built sample_accession study_accession run_accession sample_alias secondary_sample_accession first_public last_updated instrument_model library_layout library_source instrument_platform library_name library_strategy fastq_ftp fastq_aspera fastq_bytes fastq_md5 read_count submitted_ftp other_info_1 other_info_2 +XXX001;XXX002 n/a n/a n/a n/a ERR3518150 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a A B +XXX002;XXX004;XXX005 n/a n/a n/a n/a ERR3518151 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a C D diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/POSEIDON.yml new file mode 100644 index 00000000..5107eac0 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac1_vcf/POSEIDON.yml @@ -0,0 +1,14 @@ +poseidonVersion: 2.7.1 +title: ForgePac1_vcf +description: Empty package template. 
Please add a description +packageVersion: 0.1.0 +lastModified: 1970-01-01 +genotypeData: + format: EIGENSTRAT + genoFile: ForgePac1_vcf.geno + snpFile: ForgePac1_vcf.snp + indFile: ForgePac1_vcf.ind + snpSet: Other +jannoFile: ForgePac1_vcf.janno +sequencingSourceFile: ForgePac1_vcf.ssf +bibFile: ForgePac1_vcf.bib diff --git a/test/PoseidonGoldenTests/GoldenTestData/init/Schiffels/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/init/Schiffels/POSEIDON.yml index ecec7011..1feb73d5 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/init/Schiffels/POSEIDON.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/init/Schiffels/POSEIDON.yml @@ -12,11 +12,8 @@ lastModified: 1970-01-01 genotypeData: format: EIGENSTRAT genoFile: geno.txt - genoFileChkSum: 0332344057c0c4dce2ff7176f8e1103d snpFile: snp.txt - snpFileChkSum: d76e3e7a8fc0f1f5e435395424b5aeab indFile: ind.txt - indFileChkSum: f77dc756666dbfef3bb35191ae15a167 snpSet: Other jannoFile: Schiffels.janno jannoFileChkSum: fd632717ecaf337a39cfd7a828a54e99 diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/POSEIDON.yml new file mode 100644 index 00000000..8febf145 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/POSEIDON.yml @@ -0,0 +1,10 @@ +poseidonVersion: 2.7.1 +title: Schiffels +description: Empty package template. 
Please add a description +packageVersion: 0.1.0 +lastModified: 1970-01-01 +genotypeData: + format: VCF + genoFile: geno.vcf +jannoFile: Schiffels.janno +bibFile: Schiffels.bib diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.bib b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.bib new file mode 100644 index 00000000..4231eb14 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.bib @@ -0,0 +1,12 @@ +@article{exampleBibtexKey, + title = {Example Paper}, + author = {Doe, John}, + year = {2018}, + journal = {Example Journal}, + volume = {47}, + issue = {10}, + publisher = {The example society for example research}, + doi = {10.XXXX/ExampleJournal.47.777}, + url = {https://doi.org/10.XXXX/ExampleJournal.47.777}, +} + diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.janno b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.janno new file mode 100644 index 00000000..eec9f6e3 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/Schiffels.janno @@ -0,0 +1,11 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords +XXX001 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX002 U 
unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX003 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX004 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX005 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX006 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX007 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX008 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX009 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX010 U unknown n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bed b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bed new file mode 100644 index 00000000..b6a1943c Binary files /dev/null and 
b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bed differ diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bim b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bim new file mode 100644 index 00000000..36183ae6 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.bim @@ -0,0 +1,9 @@ +1 1_752566 0.0 752566 G A +1 1_842013 0.0 842013 T G +1 1_891021 0.0 891021 G A +1 1_949654 0.0 949654 A G +2 2_1018704 0.0 1018704 A G +2 2_1045331 0.0 1045331 G A +2 2_1048955 0.0 1048955 A G +2 2_1061166 0.0 1061166 T C +2 2_1108637 0.0 1108637 G A diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.fam b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.fam new file mode 100644 index 00000000..5555d65d --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.fam @@ -0,0 +1,10 @@ +unknown XXX001 0 0 0 0 +unknown XXX002 0 0 0 0 +unknown XXX003 0 0 0 0 +unknown XXX004 0 0 0 0 +unknown XXX005 0 0 0 0 +unknown XXX006 0 0 0 0 +unknown XXX007 0 0 0 0 +unknown XXX008 0 0 0 0 +unknown XXX009 0 0 0 0 +unknown XXX010 0 0 0 0 diff --git a/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.vcf b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.vcf new file mode 100644 index 00000000..413b4fb8 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/init_vcf/Schiffels_vcf/geno.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XXX001 XXX002 XXX003 XXX004 XXX005 XXX006 XXX007 XXX008 XXX009 XXX010 +1 752566 1_752566 G A 0 . . GT 0/0 1/1 1/1 1/1 1/1 1/1 1/1 0/1 1/1 1/1 +1 842013 1_842013 T G 0 . . GT 0/0 1/1 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 +1 891021 1_891021 G A 0 . . GT 1/1 1/1 1/1 1/1 ./. 0/1 1/1 1/1 1/1 1/1 +1 949654 1_949654 A G 0 . . GT 1/1 0/1 0/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1 +2 1018704 2_1018704 A G 0 . . GT 0/1 0/1 0/0 0/1 ./. 
1/1 0/1 0/0 1/1 0/1 +2 1045331 2_1045331 G A 0 . . GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 +2 1048955 2_1048955 A G 0 . . GT 1/1 1/1 ./. 1/1 0/0 0/0 0/1 0/0 0/0 0/1 +2 1061166 2_1061166 T C 0 . . GT 0/0 0/0 1/1 0/1 0/0 0/0 0/1 0/0 0/0 1/1 +2 1108637 2_1108637 G A 0 . . GT 0/0 0/0 ./. 1/1 0/1 0/0 0/0 0/0 0/1 0/0 diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_gzipped.yml b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_gzipped.yml deleted file mode 100644 index b91ec3e8..00000000 --- a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_gzipped.yml +++ /dev/null @@ -1,16 +0,0 @@ -poseidonVersion: 2.5.0 -title: Lamnidis_2018 -description: Genetic data published in Lamnidis et al. 2018 - updated version -contributor: - - name: Thiseas Lamnidis - email: lamnidis@institute.org -packageVersion: 1.0.0 -lastModified: 2019-01-15 -bibFile: sources.bib -genotypeData: - format: EIGENSTRAT - genoFile: geno.txt.gz - snpFile: snp.txt.gz - indFile: ind.txt - snpSet: Other -jannoFile: Lamnidis_2018.janno diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_nobib.yml b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_nobib.yml deleted file mode 100644 index 31942ffd..00000000 --- a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/POSEIDON_nobib.yml +++ /dev/null @@ -1,15 +0,0 @@ -poseidonVersion: 2.5.0 -title: Lamnidis_2018 -description: Genetic data published in Lamnidis et al. 
2018 - updated version -contributor: - - name: Thiseas Lamnidis - email: lamnidis@institute.org -packageVersion: 1.0.0 -lastModified: 2019-01-15 -genotypeData: - format: EIGENSTRAT - genoFile: geno.txt - snpFile: snp.txt - indFile: ind.txt - snpSet: Other -jannoFile: Lamnidis_2018.janno diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Schiffels-1.1.1/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Schiffels-1.1.1/POSEIDON.yml index ecec7011..1feb73d5 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Schiffels-1.1.1/POSEIDON.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Schiffels-1.1.1/POSEIDON.yml @@ -12,11 +12,8 @@ lastModified: 1970-01-01 genotypeData: format: EIGENSTRAT genoFile: geno.txt - genoFileChkSum: 0332344057c0c4dce2ff7176f8e1103d snpFile: snp.txt - snpFileChkSum: d76e3e7a8fc0f1f5e435395424b5aeab indFile: ind.txt - indFileChkSum: f77dc756666dbfef3bb35191ae15a167 snpSet: Other jannoFile: Schiffels.janno jannoFileChkSum: fd632717ecaf337a39cfd7a828a54e99 diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/POSEIDON_otherPlinkEncoding.yml b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/POSEIDON_otherPlinkEncoding.yml deleted file mode 100644 index efae14e9..00000000 --- a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/POSEIDON_otherPlinkEncoding.yml +++ /dev/null @@ -1,17 +0,0 @@ -poseidonVersion: 2.7.1 -title: Wang_2020 -description: Genetic data published in Wang et al. 
2020, Plink test -contributor: - - name: Ke Wang - email: wang@institute.org -packageVersion: 0.1.0 -lastModified: 2020-05-20 -bibFile: sources.bib -genotypeData: - format: PLINK - genoFile: Wang_2020.bed - snpFile: Wang_2020.bim - indFile: Wang_2020_otherPlinkEncoding.fam - snpSet: Other -jannoFile: Wang_2020.janno -sequencingSourceFile: Wang_2020.ssf \ No newline at end of file diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/Wang_2020_otherPlinkEncoding.fam b/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/Wang_2020_otherPlinkEncoding.fam deleted file mode 100644 index a71cd5c2..00000000 --- a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Wang_2020-0.1.0/Wang_2020_otherPlinkEncoding.fam +++ /dev/null @@ -1,5 +0,0 @@ - 2 SAMPLE0 0 0 2 1 - 2 SAMPLE1 0 0 1 2 - 1 SAMPLE2 0 0 2 3 - 1 SAMPLE3 0 0 1 4 - 1 SAMPLE4 0 0 2 5 diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 76cbbf79..41b0027a 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -34,7 +34,7 @@ import Poseidon.EntityTypes (EntityInput (..), readEntitiesFromString) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), - GenotypeFormatSpec (..), + GenotypeFileSpec (..), SNPSetSpec (..)) import Poseidon.ServerClient (AddJannoColSpec (..), ArchiveEndpoint (..)) @@ -81,6 +81,8 @@ dynamicCheckSumFile :: FilePath dynamicCheckSumFile = "/tmp/poseidon_trident_dynamicCheckSumFile.txt" testPacsDir :: FilePath testPacsDir = "test/testDat/testPackages/ancient" +testPacsDirOther :: FilePath +testPacsDirOther = "test/testDat/testPackages/other_test_packages" testEntityFiles :: FilePath testEntityFiles = "test/testDat/testEntityFiles" @@ -203,14 +205,15 @@ testPipelineInit :: FilePath -> FilePath -> IO () testPipelineInit testDir checkFilePath = do let initOpts1 = InitOptions { _initGenoData = 
GenotypeDataSpec { - format = GenotypeFormatEigenstrat - , genoFile = testPacsDir "Schiffels_2016" "geno.txt" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Schiffels_2016" "snp.txt" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Schiffels_2016" "ind.txt" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypeEigenstrat { + _esGenoFile = testPacsDir "Schiffels_2016" "geno.txt" + , _esGenoFileChkSum = Nothing + , _esSnpFile = testPacsDir "Schiffels_2016" "snp.txt" + , _esSnpFileChkSum = Nothing + , _esIndFile = testPacsDir "Schiffels_2016" "ind.txt" + , _esIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther } , _initPacPath = testDir "init" "Schiffels" , _initPacName = Just "Schiffels" @@ -224,16 +227,18 @@ testPipelineInit testDir checkFilePath = do , "init" "Schiffels" "geno.txt" , "init" "Schiffels" "Schiffels.bib" ] + let initOpts2 = InitOptions { _initGenoData = GenotypeDataSpec { - format = GenotypeFormatPlink - , genoFile = testPacsDir "Wang_2020" "Wang_2020.bed" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Wang_2020" "Wang_2020.bim" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Wang_2020" "Wang_2020.fam" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypePlink { + _plGenoFile = testPacsDir "Wang_2020" "Wang_2020.bed" + , _plGenoFileChkSum = Nothing + , _plSnpFile = testPacsDir "Wang_2020" "Wang_2020.bim" + , _plSnpFileChkSum = Nothing + , _plIndFile = testPacsDir "Wang_2020" "Wang_2020.fam" + , _plIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther } , _initPacPath = testDir "init" "Wang" , _initPacName = Nothing @@ -246,6 +251,28 @@ testPipelineInit testDir checkFilePath = do , "init" "Wang" "Wang_2020.bed" ] + let initOpts3 = InitOptions { + _initGenoData = GenotypeDataSpec { + genotypeFileSpec = GenotypeVCF { + _vcfGenoFile = testPacsDirOther "Schiffels_2016_vcf" "geno.vcf" + , _vcfGenoFileChkSum = Nothing + } + , 
genotypeSnpSet = Just SNPSetOther + } + , _initPacPath = testDir "init_vcf" "Schiffels_vcf" + , _initPacName = Just "Schiffels" + , _initMinimal = False + } + let action3 = testLog (runInit initOpts3) >> + patchLastModified testDir ("init_vcf" "Schiffels_vcf" "POSEIDON.yml") + runAndChecksumFiles checkFilePath testDir action3 "init" [ + "init_vcf" "Schiffels_vcf" "POSEIDON.yml" + , "init_vcf" "Schiffels_vcf" "Schiffels.janno" + , "init_vcf" "Schiffels_vcf" "geno.vcf" + , "init_vcf" "Schiffels_vcf" "Schiffels.bib" + ] + + testPipelineValidate :: FilePath -> FilePath -> IO () testPipelineValidate testDir checkFilePath = do let validateOpts1 = ValidateOptions { @@ -298,14 +325,15 @@ testPipelineValidate testDir checkFilePath = do } & run 5 validateOpts1 { _validatePlan = ValPlanGeno $ GenotypeDataSpec { - format = GenotypeFormatEigenstrat - , genoFile = testPacsDir "Schiffels_2016" "geno.txt" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Schiffels_2016" "snp.txt" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Schiffels_2016" "ind.txt" - , indFileChkSum = Nothing - , snpSet = Nothing + genotypeFileSpec = GenotypeEigenstrat { + _esGenoFile = testPacsDir "Schiffels_2016" "geno.txt" + , _esGenoFileChkSum = Nothing + , _esSnpFile = testPacsDir "Schiffels_2016" "snp.txt" + , _esSnpFileChkSum = Nothing + , _esIndFile = testPacsDir "Schiffels_2016" "ind.txt" + , _esIndFileChkSum = Nothing + } + , genotypeSnpSet = Nothing } } & run 6 validateOpts1 { @@ -389,7 +417,7 @@ testPipelineGenoconvert :: FilePath -> FilePath -> IO () testPipelineGenoconvert testDir checkFilePath = do let genoconvertOpts1 = GenoconvertOptions { _genoconvertGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016"] - , _genoConvertOutFormat = GenotypeFormatPlink + , _genoConvertOutFormat = "PLINK" , _genoConvertOutOnlyGeno = False , _genoMaybeOutPackagePath = Just $ testDir "genoconvert" "Schiffels" , _genoconvertRemoveOld = False @@ -401,9 +429,10 @@ testPipelineGenoconvert testDir 
checkFilePath = do , "genoconvert" "Schiffels" "Schiffels_2016.bim" , "genoconvert" "Schiffels" "Schiffels_2016.fam" ] + let genoconvertOpts2 = GenoconvertOptions { _genoconvertGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016"] - , _genoConvertOutFormat = GenotypeFormatPlink + , _genoConvertOutFormat = "PLINK" , _genoConvertOutOnlyGeno = False , _genoMaybeOutPackagePath = Just $ testDir "genoconvert" "Schiffels_otherPlinkEncoding" , _genoconvertRemoveOld = False @@ -415,10 +444,11 @@ testPipelineGenoconvert testDir checkFilePath = do , "genoconvert" "Schiffels_otherPlinkEncoding" "Schiffels_2016.bim" , "genoconvert" "Schiffels_otherPlinkEncoding" "Schiffels_2016.fam" ] + -- in-place conversion let genoconvertOpts3 = GenoconvertOptions { _genoconvertGenoSources = [PacBaseDir $ testDir "init" "Wang"] - , _genoConvertOutFormat = GenotypeFormatEigenstrat + , _genoConvertOutFormat = "EIGENSTRAT" , _genoConvertOutOnlyGeno = False , _genoMaybeOutPackagePath = Nothing , _genoconvertRemoveOld = False @@ -430,21 +460,23 @@ testPipelineGenoconvert testDir checkFilePath = do , "init" "Wang" "Wang.snp" , "init" "Wang" "Wang.ind" ] + let genoconvertOpts4 = GenoconvertOptions { _genoconvertGenoSources = [ GenoDirect $ GenotypeDataSpec { - format = GenotypeFormatEigenstrat - , genoFile = testDir "init" "Schiffels" "geno.txt" - , genoFileChkSum = Nothing - , snpFile = testDir "init" "Schiffels" "snp.txt" - , snpFileChkSum = Nothing - , indFile = testDir "init" "Schiffels" "ind.txt" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypeEigenstrat { + _esGenoFile = testDir "init" "Schiffels" "geno.txt" + , _esGenoFileChkSum = Nothing + , _esSnpFile = testDir "init" "Schiffels" "snp.txt" + , _esSnpFileChkSum = Nothing + , _esIndFile = testDir "init" "Schiffels" "ind.txt" + , _esIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther } ] - , _genoConvertOutFormat = GenotypeFormatPlink + , _genoConvertOutFormat = "PLINK" , 
_genoConvertOutOnlyGeno = True , _genoMaybeOutPackagePath = Nothing , _genoconvertRemoveOld = False @@ -457,6 +489,30 @@ testPipelineGenoconvert testDir checkFilePath = do , "init" "Schiffels" "geno.fam" ] + let genoconvertOpts5 = GenoconvertOptions { + _genoconvertGenoSources = [ + GenoDirect $ + GenotypeDataSpec { + genotypeFileSpec = GenotypeVCF { + _vcfGenoFile = testDir "init_vcf" "Schiffels_vcf" "geno.vcf" + , _vcfGenoFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther + } + ] + , _genoConvertOutFormat = "PLINK" + , _genoConvertOutOnlyGeno = True + , _genoMaybeOutPackagePath = Nothing + , _genoconvertRemoveOld = False + , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily + , _genoconvertOnlyLatest = False + } + runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts5) "genoconvert" [ + "init_vcf" "Schiffels_vcf" "geno.bed" + , "init_vcf" "Schiffels_vcf" "geno.bim" + , "init_vcf" "Schiffels_vcf" "geno.fam" + ] + testPipelineRectify :: FilePath -> FilePath -> IO () testPipelineRectify testDir checkFilePath = do let rectifyOpts1 = RectifyOptions { @@ -513,7 +569,7 @@ testPipelineForge testDir checkFilePath = do , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP2,,")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac1" , _forgeOutPacName = Just "ForgePac1" @@ -529,13 +585,37 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac1" "ForgePac1.ssf" , "forge" "ForgePac1" "ForgePac1.bib" ] + + -- forge test 1 with VCF + let forgeOpts1vcf = ForgeOptions { + _forgeGenoSources = [PacBaseDir $ testPacsDirOther "Schiffels_2016_vcf", PacBaseDir $ testPacsDir "Wang_2020"] + , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP2,,")] + , _forgeSnpFile = Nothing + , _forgeIntersect = False + , _forgeOutFormat 
= "EIGENSTRAT" + , _forgeOutMode = NormalOut + , _forgeOutPacPath = testDir "forge" "ForgePac1_vcf" + , _forgeOutPacName = Just "ForgePac1_vcf" + , _forgePackageWise = False + , _forgeOutputPlinkPopMode = PlinkPopNameAsFamily + , _forgeOutputOrdered = False + } + let action1vcf = testLog (runForge forgeOpts1vcf) >> patchLastModified testDir ("forge" "ForgePac1_vcf" "POSEIDON.yml") + runAndChecksumFiles checkFilePath testDir action1vcf "forge" [ + "forge" "ForgePac1_vcf" "POSEIDON.yml" + , "forge" "ForgePac1_vcf" "ForgePac1_vcf.geno" + , "forge" "ForgePac1_vcf" "ForgePac1_vcf.janno" + , "forge" "ForgePac1_vcf" "ForgePac1_vcf.ssf" + , "forge" "ForgePac1_vcf" "ForgePac1_vcf.bib" + ] + -- forge test 2 let forgeOpts2 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Wang_2020"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP2,,,-")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatPlink + , _forgeOutFormat = "PLINK" , _forgeOutMode = MinimalOut , _forgeOutPacPath = testDir "forge" "ForgePac2" , _forgeOutPacName = Nothing @@ -548,13 +628,14 @@ testPipelineForge testDir checkFilePath = do "forge" "ForgePac2" "POSEIDON.yml" , "forge" "ForgePac2" "ForgePac2.bed" ] + -- forge test 3 let forgeOpts3 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Wang_2020"] , _forgeEntityInput = [EntitiesFromFile (testEntityFiles "goldenTestForgeFile1.txt")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac3" , _forgeOutPacName = Nothing @@ -571,13 +652,14 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac3" "ForgePac3.janno" , "forge" "ForgePac3" "ForgePac3.ssf" ] + -- forge test 4 let forgeOpts4 = ForgeOptions { _forgeGenoSources = 
[PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Wang_2020"] , _forgeEntityInput = [EntitiesFromFile (testEntityFiles "goldenTestForgeFile2.txt")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatPlink + , _forgeOutFormat = "PLINK" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac4" , _forgeOutPacName = Nothing @@ -594,12 +676,13 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac4" "ForgePac4.janno" , "forge" "ForgePac4" "ForgePac4.ssf" ] + -- forge test 5 let forgeOpts5 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Wang_2020"] , _forgeEntityInput = [] , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac5" , _forgeOutPacName = Just "ForgePac5" @@ -615,35 +698,38 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac5" "ForgePac5.janno" , "forge" "ForgePac5" "ForgePac5.ssf" ] + -- forge test 6 (direct genotype data input interface) let forgeOpts6 = ForgeOptions { _forgeGenoSources = [ GenoDirect $ GenotypeDataSpec { - format = GenotypeFormatEigenstrat - , genoFile = testPacsDir "Schiffels_2016" "geno.txt" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Schiffels_2016" "snp.txt" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Schiffels_2016" "ind.txt" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypeEigenstrat { + _esGenoFile = testPacsDir "Schiffels_2016" "geno.txt" + , _esGenoFileChkSum = Nothing + , _esSnpFile = testPacsDir "Schiffels_2016" "snp.txt" + , _esSnpFileChkSum = Nothing + , _esIndFile = testPacsDir "Schiffels_2016" "ind.txt" + , _esIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther }, GenoDirect $ GenotypeDataSpec { - format = GenotypeFormatPlink - , genoFile = testPacsDir "Wang_2020" 
"Wang_2020.bed" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Wang_2020" "Wang_2020.bim" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Wang_2020" "Wang_2020.fam" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypePlink { + _plGenoFile = testPacsDir "Wang_2020" "Wang_2020.bed" + , _plGenoFileChkSum = Nothing + , _plSnpFile = testPacsDir "Wang_2020" "Wang_2020.bim" + , _plSnpFileChkSum = Nothing + , _plIndFile = testPacsDir "Wang_2020" "Wang_2020.fam" + , _plIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther } ] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP2,,")] , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = GenoOut , _forgeOutPacPath = testDir "forge" "ForgePac6" , _forgeOutPacName = Just "ForgePac6" @@ -658,25 +744,27 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac6" "ForgePac6.snp" , "forge" "ForgePac6" "ForgePac6.ind" ] + -- forge test 7 (mixed data input interface) let forgeOpts7 = ForgeOptions { _forgeGenoSources = [ PacBaseDir $ testPacsDir "Schiffels_2016", GenoDirect $ GenotypeDataSpec { - format = GenotypeFormatPlink - , genoFile = testPacsDir "Wang_2020" "Wang_2020.bed" - , genoFileChkSum = Nothing - , snpFile = testPacsDir "Wang_2020" "Wang_2020.bim" - , snpFileChkSum = Nothing - , indFile = testPacsDir "Wang_2020" "Wang_2020.fam" - , indFileChkSum = Nothing - , snpSet = Just SNPSetOther + genotypeFileSpec = GenotypePlink { + _plGenoFile = testPacsDir "Wang_2020" "Wang_2020.bed" + , _plGenoFileChkSum = Nothing + , _plSnpFile = testPacsDir "Wang_2020" "Wang_2020.bim" + , _plSnpFileChkSum = Nothing + , _plIndFile = testPacsDir "Wang_2020" "Wang_2020.fam" + , _plIndFileChkSum = Nothing + } + , genotypeSnpSet = Just SNPSetOther } ] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP2,,")] , _forgeIntersect = False - , 
_forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac7" , _forgeOutPacName = Just "ForgePac7" @@ -692,13 +780,14 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac7" "ForgePac7.ind" , "forge" "ForgePac7" "ForgePac7.janno" ] + -- forge test 8 (combining additional janno columns from separate source janno files) let forgeOpts8 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Lamnidis_2018_newVersion"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString ",")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac8" , _forgeOutPacName = Just "ForgePac8" @@ -711,13 +800,14 @@ testPipelineForge testDir checkFilePath = do "forge" "ForgePac8" "ForgePac8.janno" , "forge" "ForgePac8" "ForgePac8.ssf" ] + -- forge test 9 (duplicates are handled correctly if an individual is properly specified) let forgeOpts9 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Schmid_2028"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP1,-")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac9" , _forgeOutPacName = Just "ForgePac9" @@ -731,13 +821,14 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac9" "ForgePac9.janno" , "forge" "ForgePac9" "ForgePac9.ssf" ] + -- forge test 10 (duplicates can also be resolved with negative selection) let forgeOpts10 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Schmid_2028"] , _forgeEntityInput 
= [EntitiesDirect (fromRight [] $ readEntitiesFromString "-,-")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac10" , _forgeOutPacName = Just "ForgePac10" @@ -752,13 +843,14 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac10" "ForgePac10.ssf" , "forge" "ForgePac10" "ForgePac10.bib" ] + -- forge test 11 (--packagewise works as expected) let forgeOpts11 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016", PacBaseDir $ testPacsDir "Schmid_2028"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP3")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac11" , _forgeOutPacName = Just "ForgePac11" @@ -772,13 +864,14 @@ testPipelineForge testDir checkFilePath = do , "forge" "ForgePac11" "ForgePac11.janno" , "forge" "ForgePac11" "ForgePac11.ssf" ] + -- simple package version selection let forgeOpts12 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "*Lamnidis_2018-1.0.0*")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac12" , _forgeOutPacName = Just "ForgePac12" @@ -790,13 +883,14 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action12 "forge" [ "forge" "ForgePac12" "ForgePac12.ind" ] + -- merge an explicitly versioned package with another package let forgeOpts13 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ 
readEntitiesFromString "*Lamnidis_2018-1.0.1*,*Schiffels_2016*")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac13" , _forgeOutPacName = Just "ForgePac13" @@ -808,6 +902,7 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action13 "forge" [ "forge" "ForgePac13" "ForgePac13.janno" ] + -- use the SpecificInd interface to merge individuals from the same package across different versions let forgeOpts14 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] @@ -815,7 +910,7 @@ testPipelineForge testDir checkFilePath = do readEntitiesFromString ",")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac14" , _forgeOutPacName = Just "ForgePac14" @@ -827,6 +922,7 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action14 "forge" [ "forge" "ForgePac14" "ForgePac14.janno" ] + -- -- negative selection with different package versions - use versioned to cancel versioned let forgeOpts15 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] @@ -834,7 +930,7 @@ testPipelineForge testDir checkFilePath = do readEntitiesFromString "*Lamnidis_2018-1.0.1*,-*Lamnidis_2018-1.0.1*,*Lamnidis_2018-1.0.0*")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac15" , _forgeOutPacName = Just "ForgePac15" @@ -846,6 +942,7 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action15 "forge" [ "forge" "ForgePac15" "ForgePac15.janno" ] + -- negative selection with different package versions - use 
unversioned to cancel versioned let forgeOpts16 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] @@ -853,7 +950,7 @@ testPipelineForge testDir checkFilePath = do readEntitiesFromString "*Lamnidis_2018-1.0.1*,-*Lamnidis_2018*,*Lamnidis_2018-1.0.0*")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac16" , _forgeOutPacName = Just "ForgePac16" @@ -865,6 +962,7 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action16 "forge" [ "forge" "ForgePac16" "ForgePac16.janno" ] + -- negative selection with different package versions - using the SpecificInd interface let forgeOpts17 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir] @@ -872,7 +970,7 @@ testPipelineForge testDir checkFilePath = do readEntitiesFromString "*Lamnidis_2018-1.0.1*,-POP2,-,-")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac17" , _forgeOutPacName = Just "ForgePac17" @@ -884,12 +982,13 @@ testPipelineForge testDir checkFilePath = do runAndChecksumFiles checkFilePath testDir action17 "forge" [ "forge" "ForgePac17" "ForgePac17.janno" ] + let forgeOpts18 = ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP3,,,")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatPlink + , _forgeOutFormat = "PLINK" , _forgeOutMode = NormalOut , _forgeOutPacPath = testDir "forge" "ForgePac18" , _forgeOutPacName = Just "ForgePac18" @@ -903,12 +1002,13 @@ testPipelineForge testDir checkFilePath = do "forge" "ForgePac18" "ForgePac18.fam", "forge" "ForgePac18" "ForgePac18.bed" ] + let forgeOpts19 = 
ForgeOptions { _forgeGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016"] , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "")] , _forgeSnpFile = Nothing , _forgeIntersect = False - , _forgeOutFormat = GenotypeFormatPlink + , _forgeOutFormat = "PLINK" , _forgeOutMode = PreservePymlOut , _forgeOutPacPath = testDir "forge" "ForgePac19" , _forgeOutPacName = Just "ForgePac19" diff --git a/test/testDat/.gitignore b/test/testDat/.gitignore index 4354b3fc..6fef6631 100644 --- a/test/testDat/.gitignore +++ b/test/testDat/.gitignore @@ -1,2 +1,2 @@ poseidonHSGoldenTestData/2019_Nikitin_LBK/*.bed -poseidonHSGoldenTestData/2019_Nikitin_LBK/*.bim +poseidonHSGoldenTestData/2019_Nikitin_LBK/*.bim \ No newline at end of file diff --git a/test/testDat/testPackages/ancient/Lamnidis_2018/.DS_Store b/test/testDat/testPackages/ancient/Lamnidis_2018/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/ancient/Lamnidis_2018/.DS_Store differ diff --git a/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_gzipped.yml b/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_gzipped.yml deleted file mode 100644 index b91ec3e8..00000000 --- a/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_gzipped.yml +++ /dev/null @@ -1,16 +0,0 @@ -poseidonVersion: 2.5.0 -title: Lamnidis_2018 -description: Genetic data published in Lamnidis et al. 
2018 - updated version -contributor: - - name: Thiseas Lamnidis - email: lamnidis@institute.org -packageVersion: 1.0.0 -lastModified: 2019-01-15 -bibFile: sources.bib -genotypeData: - format: EIGENSTRAT - genoFile: geno.txt.gz - snpFile: snp.txt.gz - indFile: ind.txt - snpSet: Other -jannoFile: Lamnidis_2018.janno diff --git a/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_nobib.yml b/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_nobib.yml deleted file mode 100644 index 31942ffd..00000000 --- a/test/testDat/testPackages/ancient/Lamnidis_2018/POSEIDON_nobib.yml +++ /dev/null @@ -1,15 +0,0 @@ -poseidonVersion: 2.5.0 -title: Lamnidis_2018 -description: Genetic data published in Lamnidis et al. 2018 - updated version -contributor: - - name: Thiseas Lamnidis - email: lamnidis@institute.org -packageVersion: 1.0.0 -lastModified: 2019-01-15 -genotypeData: - format: EIGENSTRAT - genoFile: geno.txt - snpFile: snp.txt - indFile: ind.txt - snpSet: Other -jannoFile: Lamnidis_2018.janno diff --git a/test/testDat/testPackages/ancient/Lamnidis_2018/geno.txt.gz b/test/testDat/testPackages/ancient/Lamnidis_2018/geno.txt.gz deleted file mode 100644 index 1b08510a..00000000 Binary files a/test/testDat/testPackages/ancient/Lamnidis_2018/geno.txt.gz and /dev/null differ diff --git a/test/testDat/testPackages/ancient/Lamnidis_2018/snp.txt.gz b/test/testDat/testPackages/ancient/Lamnidis_2018/snp.txt.gz deleted file mode 100644 index deb1db25..00000000 Binary files a/test/testDat/testPackages/ancient/Lamnidis_2018/snp.txt.gz and /dev/null differ diff --git a/test/testDat/testPackages/ancient/Schiffels_2016/.DS_Store b/test/testDat/testPackages/ancient/Schiffels_2016/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/ancient/Schiffels_2016/.DS_Store differ diff --git a/test/testDat/testPackages/ancient/Wang_2020/.DS_Store b/test/testDat/testPackages/ancient/Wang_2020/.DS_Store new file mode 100644 index 
00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/ancient/Wang_2020/.DS_Store differ diff --git a/test/testDat/testPackages/ancient/Wang_2020/POSEIDON_otherPlinkEncoding.yml b/test/testDat/testPackages/ancient/Wang_2020/POSEIDON_otherPlinkEncoding.yml deleted file mode 100644 index efae14e9..00000000 --- a/test/testDat/testPackages/ancient/Wang_2020/POSEIDON_otherPlinkEncoding.yml +++ /dev/null @@ -1,17 +0,0 @@ -poseidonVersion: 2.7.1 -title: Wang_2020 -description: Genetic data published in Wang et al. 2020, Plink test -contributor: - - name: Ke Wang - email: wang@institute.org -packageVersion: 0.1.0 -lastModified: 2020-05-20 -bibFile: sources.bib -genotypeData: - format: PLINK - genoFile: Wang_2020.bed - snpFile: Wang_2020.bim - indFile: Wang_2020_otherPlinkEncoding.fam - snpSet: Other -jannoFile: Wang_2020.janno -sequencingSourceFile: Wang_2020.ssf \ No newline at end of file diff --git a/test/testDat/testPackages/ancient/Wang_2020/Wang_2020_otherPlinkEncoding.fam b/test/testDat/testPackages/ancient/Wang_2020/Wang_2020_otherPlinkEncoding.fam deleted file mode 100644 index a71cd5c2..00000000 --- a/test/testDat/testPackages/ancient/Wang_2020/Wang_2020_otherPlinkEncoding.fam +++ /dev/null @@ -1,5 +0,0 @@ - 2 SAMPLE0 0 0 2 1 - 2 SAMPLE1 0 0 1 2 - 1 SAMPLE2 0 0 2 3 - 1 SAMPLE3 0 0 1 4 - 1 SAMPLE4 0 0 2 5 diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/.DS_Store b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/.DS_Store differ diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/Lamnidis_2018.janno b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/Lamnidis_2018.janno new file mode 100755 index 00000000..b344f4f2 --- /dev/null +++ 
b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/Lamnidis_2018.janno @@ -0,0 +1,11 @@ +Poseidon_ID Group_Name Genetic_Sex Publication +XXX011 POP1 M Lamnidis2018 +XXX012 POP2 F Lamnidis2018 +XXX013 POP1 M Lamnidis2018 +XXX014 POP2 F Lamnidis2018 +XXX015 POP2 M Lamnidis2018 +XXX016 POP2 F Lamnidis2018 +XXX017 POP1 M Lamnidis2018 +XXX018 POP3 F Lamnidis2018 +XXX019 POP1 F Lamnidis2018 +XXX099 POP3 M Lamnidis2018 diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/POSEIDON_gzipped.yml b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/POSEIDON.yml similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/POSEIDON_gzipped.yml rename to test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/POSEIDON.yml diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/geno.txt.gz b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/geno.txt.gz similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/geno.txt.gz rename to test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/geno.txt.gz diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/ind.txt b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/ind.txt new file mode 100644 index 00000000..d742cb30 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/ind.txt @@ -0,0 +1,10 @@ +XXX011 M POP1 +XXX012 F POP2 +XXX013 M POP1 +XXX014 F POP2 +XXX015 M POP2 +XXX016 F POP2 +XXX017 M POP1 +XXX018 F POP3 +XXX019 F POP1 +XXX099 M POP3 diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/snp.txt.gz b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/snp.txt.gz similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/snp.txt.gz rename to 
test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/snp.txt.gz diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/sources.bib b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/sources.bib new file mode 100644 index 00000000..47e978d9 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_gzipped/sources.bib @@ -0,0 +1,3 @@ +@article{Lamnidis2018, + title = Test +} diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/.DS_Store b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/.DS_Store differ diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/Lamnidis_2018.janno b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/Lamnidis_2018.janno new file mode 100755 index 00000000..b344f4f2 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/Lamnidis_2018.janno @@ -0,0 +1,11 @@ +Poseidon_ID Group_Name Genetic_Sex Publication +XXX011 POP1 M Lamnidis2018 +XXX012 POP2 F Lamnidis2018 +XXX013 POP1 M Lamnidis2018 +XXX014 POP2 F Lamnidis2018 +XXX015 POP2 M Lamnidis2018 +XXX016 POP2 F Lamnidis2018 +XXX017 POP1 M Lamnidis2018 +XXX018 POP3 F Lamnidis2018 +XXX019 POP1 F Lamnidis2018 +XXX099 POP3 M Lamnidis2018 diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/POSEIDON_nobib.yml b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/POSEIDON.yml similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Lamnidis_2018/POSEIDON_nobib.yml rename to test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/POSEIDON.yml diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/geno.txt 
b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/geno.txt new file mode 100644 index 00000000..18e8036c --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/geno.txt @@ -0,0 +1,9 @@ +2000000100 +2022221222 +0000910000 +1101221220 +0110100000 +2222222222 +2292221221 +2201221220 +2292122212 diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/geno.txt.gz b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/geno.txt.gz similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/geno.txt.gz rename to test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/geno.txt.gz diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/ind.txt b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/ind.txt new file mode 100644 index 00000000..d742cb30 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/ind.txt @@ -0,0 +1,10 @@ +XXX011 M POP1 +XXX012 F POP2 +XXX013 M POP1 +XXX014 F POP2 +XXX015 M POP2 +XXX016 F POP2 +XXX017 M POP1 +XXX018 F POP3 +XXX019 F POP1 +XXX099 M POP3 diff --git a/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/snp.txt b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/snp.txt new file mode 100644 index 00000000..b37d179b --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/snp.txt @@ -0,0 +1,9 @@ +1_752566 1 0.020130 752566 G A +1_842013 1 0.022518 842013 T G +1_891021 1 0.024116 891021 G A +1_903426 1 0.024457 903426 C T +1_949654 1 0.025727 949654 A G +2_1045331 2 0.026665 1045331 G A +2_1048955 2 0.026674 1048955 A G +2_1061166 2 0.026711 1061166 T C +2_1108637 2 0.028311 1108637 G A diff --git a/test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/snp.txt.gz b/test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/snp.txt.gz similarity index 
100% rename from test/PoseidonGoldenTests/GoldenTestData/timetravel/Lamnidis_2018-1.0.0/snp.txt.gz rename to test/testDat/testPackages/other_test_packages/Lamnidis_2018_nobib/snp.txt.gz diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/.DS_Store b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/.DS_Store differ diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/CHANGELOG.md b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/CHANGELOG.md new file mode 100644 index 00000000..c4d4c8f5 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/CHANGELOG.md @@ -0,0 +1 @@ +V 1.0.1: not specified diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/POSEIDON.yml b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/POSEIDON.yml new file mode 100644 index 00000000..0e6c315f --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/POSEIDON.yml @@ -0,0 +1,20 @@ +poseidonVersion: 2.7.1 +title: Schiffels_2016 +description: Genetic data published in Schiffels et al. 
2016 +contributor: +- name: Stephan Schiffels + email: schiffels@institute.org +- name: Josiah Carberry + email: carberry@brown.edu + orcid: 0000-0002-1825-0097 +packageVersion: 1.0.1 +lastModified: 2021-11-09 +genotypeData: + format: VCF + genoFile: geno.vcf + snpSet: Other +jannoFile: Schiffels_2016.janno +sequencingSourceFile: ena_table.ssf +bibFile: sources.bib +readmeFile: README.md +changelogFile: CHANGELOG.md diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/README.md b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/README.md new file mode 100644 index 00000000..af27ff49 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/README.md @@ -0,0 +1 @@ +This is a test file. \ No newline at end of file diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/Schiffels_2016.janno b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/Schiffels_2016.janno new file mode 100755 index 00000000..69d7e0cd --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/Schiffels_2016.janno @@ -0,0 +1,11 @@ +Poseidon_ID Group_Name Genetic_Sex Publication AddCol1 AddCol2 +XXX001 POP1 M Schiffels2016 v1 v2 +XXX002 POP2 F Schiffels2016 v1 v2 +XXX003 POP1 M Schiffels2016 v1 v2 +XXX004 POP2 F Schiffels2016 v1 v2 +XXX005 POP2 M Schiffels2016;TestPaper1 v1 v2 +XXX006 POP2 F Schiffels2016;TestPaper1 v1 v2 +XXX007 POP1 M Schiffels2016;TestBook1 v1 v2 +XXX008 POP3 F Schiffels2016;TestBook1 v1 v2 +XXX009 POP1 F Schiffels2016;TestPaper1;TestBook1 v1 v2 +XXX010 POP3 M Schiffels2016;TestPaper1;TestBook1 v1 v2 diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/ena_table.ssf b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/ena_table.ssf new file mode 100644 index 00000000..4bde27b8 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/ena_table.ssf @@ -0,0 +1,5 @@ +poseidon_IDs 
run_accession other_info_1 other_info_2 +XXX001;XXX002 ERR3518150 A B +XXX002;XXX004;XXX005 ERR3518151 C D +XXX003 ERR3518152 E F +XXX001 ERR3518153 G H diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/geno.vcf b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/geno.vcf new file mode 100644 index 00000000..413b4fb8 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/geno.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XXX001 XXX002 XXX003 XXX004 XXX005 XXX006 XXX007 XXX008 XXX009 XXX010 +1 752566 1_752566 G A 0 . . GT 0/0 1/1 1/1 1/1 1/1 1/1 1/1 0/1 1/1 1/1 +1 842013 1_842013 T G 0 . . GT 0/0 1/1 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 +1 891021 1_891021 G A 0 . . GT 1/1 1/1 1/1 1/1 ./. 0/1 1/1 1/1 1/1 1/1 +1 949654 1_949654 A G 0 . . GT 1/1 0/1 0/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1 +2 1018704 2_1018704 A G 0 . . GT 0/1 0/1 0/0 0/1 ./. 1/1 0/1 0/0 1/1 0/1 +2 1045331 2_1045331 G A 0 . . GT 1/1 1/1 1/1 1/1 0/0 0/0 0/0 0/0 0/0 0/0 +2 1048955 2_1048955 A G 0 . . GT 1/1 1/1 ./. 1/1 0/0 0/0 0/1 0/0 0/0 0/1 +2 1061166 2_1061166 T C 0 . . GT 0/0 0/0 1/1 0/1 0/0 0/0 0/1 0/0 0/0 1/1 +2 1108637 2_1108637 G A 0 . . GT 0/0 0/0 ./. 
1/1 0/1 0/0 0/0 0/0 0/1 0/0 diff --git a/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/sources.bib b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/sources.bib new file mode 100644 index 00000000..26dd8355 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/sources.bib @@ -0,0 +1,11 @@ +@article{Schiffels2016, + title = Test +} + +@article{TestPaper1, + title = TestPaper +} + +@book{TestBook1, + title = TestBook +} diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/.DS_Store b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/.DS_Store differ diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Wang_2020/POSEIDON_otherPlinkEncoding.yml b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/POSEIDON.yml similarity index 90% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Wang_2020/POSEIDON_otherPlinkEncoding.yml rename to test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/POSEIDON.yml index efae14e9..1e137405 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Wang_2020/POSEIDON_otherPlinkEncoding.yml +++ b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/POSEIDON.yml @@ -11,7 +11,7 @@ genotypeData: format: PLINK genoFile: Wang_2020.bed snpFile: Wang_2020.bim - indFile: Wang_2020_otherPlinkEncoding.fam + indFile: Wang_2020.fam snpSet: Other jannoFile: Wang_2020.janno sequencingSourceFile: Wang_2020.ssf \ No newline at end of file diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bed b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bed new file mode 100644 index 00000000..b75a5356 --- 
/dev/null +++ b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bed @@ -0,0 +1 @@ +lê«‹¨èª/¨è«¯ª « \ No newline at end of file diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bim b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bim new file mode 100644 index 00000000..d6fa8180 --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.bim @@ -0,0 +1,7 @@ +11 rs0000 0.000000 0 A C +11 rs1111 0.001000 100000 A G +11 rs2222 0.002000 200000 A T +11 rs3333 0.003000 300000 C A +11 rs4444 0.004000 400000 G A +11 rs5555 0.005000 500000 T A +11 rs6666 0.006000 600000 G T diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/Wang_2020/Wang_2020_otherPlinkEncoding.fam b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.fam similarity index 100% rename from test/PoseidonGoldenTests/GoldenTestData/chronicle/Wang_2020/Wang_2020_otherPlinkEncoding.fam rename to test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.fam diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.janno b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.janno new file mode 100755 index 00000000..28ef9e1e --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.janno @@ -0,0 +1,6 @@ +Poseidon_ID Group_Name Genetic_Sex Publication +SAMPLE0 1 F n/a +SAMPLE1 2 M TestPaper1 +SAMPLE2 3 F Wang2020;TestPaper1 +SAMPLE3 4 M Wang2020;TestBook2 +SAMPLE4 5 F Wang2020;TestPaper1;TestBook2 diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.ssf b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.ssf new file mode 100644 index 00000000..06b2766c --- /dev/null +++ 
b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/Wang_2020.ssf @@ -0,0 +1,2 @@ +poseidon_IDs run_accession other_info_1 other_info_2 +SAMPLE1 ERR3518154 A B diff --git a/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/sources.bib b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/sources.bib new file mode 100644 index 00000000..93693fbf --- /dev/null +++ b/test/testDat/testPackages/other_test_packages/Wang_2020_otherPlinkEncoding/sources.bib @@ -0,0 +1,11 @@ +@article{Wang2020, + title = Test +} + +@article{TestPaper1, + title = TestPaper +} + +@book{TestBook2, + title = TestBook +}