From dade6695251f116435e080f31a061831726d316d Mon Sep 17 00:00:00 2001 From: jke000 Date: Wed, 13 Nov 2024 17:17:29 -0800 Subject: [PATCH 01/18] intermediate commit to add back peptide index support --- CometSearch/CometDataInternal.h | 15 ++++--- CometSearch/CometFragmentIndex.cpp | 12 ++--- CometSearch/CometInterfaces.h | 1 - CometSearch/CometMassSpecUtils.cpp | 8 ++-- CometSearch/CometPostAnalysis.cpp | 2 +- CometSearch/CometPreprocess.cpp | 6 +-- CometSearch/CometSearch.cpp | 65 +++++++++++++++++++-------- CometSearch/CometSearchManager.cpp | 69 +++++++++++++---------------- CometSearch/CometSearchManager.h | 1 - CometSearch/CometWriteMzIdentML.cpp | 2 +- CometSearch/CometWriteOut.cpp | 2 +- CometSearch/Makefile | 10 +++-- Makefile | 4 +- 13 files changed, 112 insertions(+), 85 deletions(-) diff --git a/CometSearch/CometDataInternal.h b/CometSearch/CometDataInternal.h index d6212171..097688d0 100644 --- a/CometSearch/CometDataInternal.h +++ b/CometSearch/CometDataInternal.h @@ -134,7 +134,8 @@ struct Options int bSkipAlreadyDone; // 0=search everything; 1=don't re-search if .out exists int bMango; // 0=normal; 1=Mango x-link ms2 input int bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment - int bCreateIndex; // 0=normal search; 1=create peptide index file + int bCreateFragmentIndex; // 0=normal search; 1=create fragment ion index file + int bCreatePeptideIndex; // 0=normal search; 1=create peptide index file; only one of bCreateFragmentIndex and bCreatePeptideIndex can be 1 int bVerboseOutput; int bShowFragmentIons; int bExplicitDeltaCn; // if set to 1, do not use sequence similarity logic @@ -194,7 +195,8 @@ struct Options bSkipAlreadyDone = a.bSkipAlreadyDone; bMango = a.bMango; bScaleFragmentNL = a.bScaleFragmentNL; - bCreateIndex = a.bCreateIndex; + bCreatePeptideIndex = a.bCreatePeptideIndex; + bCreateFragmentIndex = a.bCreateFragmentIndex; bVerboseOutput = a.bVerboseOutput; bShowFragmentIons = 
a.bShowFragmentIons; bExplicitDeltaCn = a.bExplicitDeltaCn; @@ -712,7 +714,7 @@ struct StaticParams double dOneMinusBinOffset; // this is used in BIN() many times so calculate once IonInfo ionInformation; int iXcorrProcessingOffset; - int bIndexDb; // 0 = normal fasta; 1 = indexed database + int iIndexDb; // 0 = normal fasta; 1 = fragment ion indexed; 2 = peptide index vector vectorMassOffsets; vector precursorNLIons; int iPrecursorNLSize; @@ -767,7 +769,7 @@ struct StaticParams szMod[0] = '\0'; iXcorrProcessingOffset = 75; - bIndexDb = 0; + iIndexDb = 0; databaseInfo.szDatabase[0] = '\0'; @@ -882,7 +884,8 @@ struct StaticParams options.bSkipAlreadyDone = 1; options.bMango = 0; options.bScaleFragmentNL = 0; - options.bCreateIndex = 0; + options.bCreatePeptideIndex = 0; + options.bCreateFragmentIndex = 0; options.bVerboseOutput = 0; options.iDecoySearch = 0; options.iNumThreads = 4; @@ -949,7 +952,7 @@ extern StaticParams g_staticParams; extern string g_psGITHUB_SHA; // grab the GITHUB_SHA environment variable and trim to 7 chars; null if environment variable not present -extern vector g_pvDBIndex; +extern vector g_pvDBIndex; // used in both peptide index and fragment ion index; latter to store plain peptides extern vector> g_pvProteinsList; diff --git a/CometSearch/CometFragmentIndex.cpp b/CometSearch/CometFragmentIndex.cpp index 9a87d4a5..aa15e2d4 100644 --- a/CometSearch/CometFragmentIndex.cpp +++ b/CometSearch/CometFragmentIndex.cpp @@ -671,15 +671,15 @@ bool CometFragmentIndex::WritePlainPeptideIndex(ThreadPool *tp) if (bSucceeded) { - g_staticParams.options.bCreateIndex = true; - g_staticParams.bIndexDb = false; + g_staticParams.options.bCreateFragmentIndex = true; + g_staticParams.iIndexDb = 0; // this step calls RunSearch just to pull out all peptides // to write into the .idx pepties/proteins file bSucceeded = CometSearch::RunSearch(0, 0, tp); - g_staticParams.options.bCreateIndex = false; - g_staticParams.bIndexDb = true; + 
g_staticParams.options.bCreateFragmentIndex = false; + g_staticParams.iIndexDb = 1; } if (bSwapIdxExtension) @@ -767,7 +767,7 @@ bool CometFragmentIndex::WritePlainPeptideIndex(ThreadPool *tp) cout << " - write peptides/proteins to file" << endl; // write out index header - fprintf(fp, "Comet peptide index. Comet version %s\n", g_sCometVersion.c_str()); + fprintf(fp, "Comet fragment ion index plain peptides. Comet version %s\n", g_sCometVersion.c_str()); fprintf(fp, "InputDB: %s\n", g_staticParams.databaseInfo.szDatabase); fprintf(fp, "MassRange: %lf %lf\n", g_staticParams.options.dPeptideMassLow, g_staticParams.options.dPeptideMassHigh); fprintf(fp, "LengthRange: %d %d\n", g_staticParams.options.peptideLengthRange.iStart, g_staticParams.options.peptideLengthRange.iEnd); @@ -893,7 +893,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) if (g_bPlainPeptideIndexRead) return 1; - if (g_staticParams.options.bCreateIndex && !strstr(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) + if (g_staticParams.options.bCreateFragmentIndex && !strstr(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) strIndexFile = g_staticParams.databaseInfo.szDatabase + string(".idx"); else // database already is .idx strIndexFile = g_staticParams.databaseInfo.szDatabase; diff --git a/CometSearch/CometInterfaces.h b/CometSearch/CometInterfaces.h index c827c06d..c3d53a4d 100644 --- a/CometSearch/CometInterfaces.h +++ b/CometSearch/CometInterfaces.h @@ -29,7 +29,6 @@ namespace CometInterfaces { public: virtual ~ICometSearchManager() {} - virtual bool CreateIndex() = 0; virtual bool DoSearch() = 0; virtual bool InitializeSingleSpectrumSearch() = 0; virtual void FinalizeSingleSpectrumSearch() = 0; diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index f9ce6f27..e2624b3e 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ 
b/CometSearch/CometMassSpecUtils.cpp @@ -137,7 +137,7 @@ void CometMassSpecUtils::GetProteinName(FILE *fpdb, comet_fseek(fpdb, lFilePosition, SEEK_SET); - if (g_staticParams.bIndexDb) //index database + if (g_staticParams.iIndexDb) //fragment ion or peptide index { long lSize; @@ -173,7 +173,7 @@ void CometMassSpecUtils::GetProteinSequence(FILE *fpdb, { strSeq.clear(); - if (!g_staticParams.bIndexDb) // works only for regular FASTA + if (!g_staticParams.iIndexDb) // works only for regular FASTA { int iTmpCh; @@ -220,7 +220,7 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, int iLenDecoyPrefix = (int)strlen(g_staticParams.szDecoyPrefix); - if (g_staticParams.bIndexDb) //index database + if (g_staticParams.iIndexDb) //fragment ion or peptide index { Results *pOutput; @@ -334,7 +334,7 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpdb, int iPrintTargetDecoy, // 0 = target+decoys, 1=target only, 2=decoy only int iWhichTerm) // 0=no term constraint, 1=protein N-term, 2=protein C-term { - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) // fragment ion or peptide index { Results *pOutput; int iTmpCh = 0; diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index e9bd5839..cd93ee33 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -330,7 +330,7 @@ void CometPostAnalysis::CalculateSP(Results *pOutput, for (i = 0; i < iSize; ++i) { - if (!g_staticParams.bIndexDb) + if (!g_staticParams.iIndexDb) { // hijack here to make protein vector unique if (pOutput[i].pWhichProtein.size() > 1) diff --git a/CometSearch/CometPreprocess.cpp b/CometSearch/CometPreprocess.cpp index a0919a6b..fd3c7091 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -1611,7 +1611,7 @@ bool CometPreprocess::LoadIons(struct Query *pScoring, int iNumFragmentPeaks = 0; - if (g_staticParams.bIndexDb && mstSpectrum.size() > FRAGINDEX_MAX_NUMPEAKS) + if (g_staticParams.iIndexDb && 
mstSpectrum.size() > FRAGINDEX_MAX_NUMPEAKS) { // sorts spectrum in ascending order by intensity mstSpectrum.sortIntensity(); @@ -1628,7 +1628,7 @@ bool CometPreprocess::LoadIons(struct Query *pScoring, if (dIntensity >= dIntensityCutoff && dIntensity > 0.0) { - if (g_staticParams.bIndexDb && iNumFragmentPeaks < FRAGINDEX_MAX_NUMPEAKS) + if (g_staticParams.iIndexDb && iNumFragmentPeaks < FRAGINDEX_MAX_NUMPEAKS) { // Store list of fragment masses for fragment index search // Intensities don't matter here @@ -2060,7 +2060,7 @@ bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, if (bPass) { - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) pScoring->vdRawFragmentPeakMass.push_back(dIon); if (dIon < (pScoring->_pepMassInfo.dExpPepMass + 50.0)) diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index ce27e179..f12071c6 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -125,7 +125,7 @@ bool CometSearch::RunSearch(int iPercentStart, { bool bSucceeded = true; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb == 1) { CometFragmentIndex sqFI; CometSearch sqSearch; @@ -160,6 +160,10 @@ bool CometSearch::RunSearch(int iPercentStart, } return bSucceeded; + } + else if (g_staticParams.iIndexDb == 2) + { + } else { @@ -241,7 +245,7 @@ bool CometSearch::RunSearch(int iPercentStart, } } - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.options.bCreateIndex) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.options.bCreatePeptideIndex && !g_staticParams.options.bCreateFragmentIndex) { logout(" - Search progress: "); fflush(stdout); @@ -800,7 +804,7 @@ bool CometSearch::RunSearch(int iPercentStart, { char szTmp[128]; lCurrPos = ftell(fp); - if (g_staticParams.options.bCreateIndex) + if (g_staticParams.options.bCreatePeptideIndex || g_staticParams.options.bCreateFragmentIndex) sprintf(szTmp, "%3d%%", (int)(100.0*(0.005 + (double)lCurrPos/(double)lEndPos))); else // 
go from iPercentStart to iPercentEnd, scaled by lCurrPos/iEndPos sprintf(szTmp, "%3d%%", (int)(((iPercentStart + ((double)iPercentEnd-iPercentStart)*(double)lCurrPos/(double)lEndPos) ))); @@ -836,7 +840,7 @@ bool CometSearch::RunSearch(int iPercentStart, if (!g_staticParams.options.bOutputSqtStream) { char szTmp[128]; - if (g_staticParams.options.bCreateIndex) + if (g_staticParams.options.bCreatePeptideIndex || g_staticParams.options.bCreateFragmentIndex) sprintf(szTmp, "100%%\n"); else sprintf(szTmp, "%3d%%\n", iPercentEnd); @@ -1729,7 +1733,8 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, if (iLenPeptide <= g_staticParams.options.peptideLengthRange.iEnd) { - if (g_staticParams.options.bCreateIndex) // && !g_staticParams.variableModParameters.bRequireVarMod) + if ((g_staticParams.options.bCreatePeptideIndex && !g_staticParams.variableModParameters.iRequireVarMod) + || g_staticParams.options.bCreateFragmentIndex) { int iPepLen = iEndPos - iStartPos + 1; @@ -1916,7 +1921,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, char szDecoyPeptide[MAX_PEPTIDE_LEN_P2]; // Allow for prev/next AA in string. // Calculate ion series just once to compare against all relevant query spectra. - if (bFirstTimeThroughLoopForPeptide && !g_staticParams.options.bCreateIndex) + if (bFirstTimeThroughLoopForPeptide && !(g_staticParams.options.bCreatePeptideIndex || g_staticParams.options.bCreateFragmentIndex)) { int iLenMinus1 = iEndPos - iStartPos; // Equals iLenPeptide minus 1. 
double dBion = g_staticParams.precalcMasses.dNtermProton; @@ -2186,7 +2191,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, { dCalcPepMass += (double)g_staticParams.massUtility.pdAAMassParent[(int)szProteinSeq[iEndPos]]; - if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateIndex) + if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateFragmentIndex) CountVarMods(piVarModCounts, szProteinSeq[iEndPos], iEndPos); if (iEndPos == iProteinSeqLengthMinus1) @@ -2197,13 +2202,13 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, else if (dCalcPepMass > g_massRange.dMaxMass || iEndPos==iProteinSeqLengthMinus1 || iLenPeptide == g_staticParams.options.peptideLengthRange.iEnd) { // Run variable mod search before incrementing iStartPos. - if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateIndex) + if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateFragmentIndex) { // If any variable mod mass is negative, consider adding to iEndPos as long // as peptide minus all possible negative mods is less than the dMaxMass???? // // Otherwise, at this point, peptide mass is too big which means should be ok for varmod search. 
- if (!g_staticParams.options.bCreateIndex && HasVariableMod(piVarModCounts, iStartPos, iEndPos, &dbe)) + if (HasVariableMod(piVarModCounts, iStartPos, iEndPos, &dbe)) { // if variable mod protein filter applied, set residue mod count to 0 for the // particular variable mod if current protein not on the protein filter list @@ -2223,7 +2228,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, VariableModSearch(szProteinSeq, piVarModCounts, iStartPos, iEndPos, iClipNtermMetOffset, pbDuplFragment, &dbe); } - if (!g_staticParams.options.bCreateIndex && g_massRange.bNarrowMassRange) + if (g_massRange.bNarrowMassRange) SubtractVarMods(piVarModCounts, szProteinSeq[iStartPos], iStartPos); } @@ -2249,7 +2254,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, { dCalcPepMass -= (double)g_staticParams.massUtility.pdAAMassParent[(int)szProteinSeq[iEndPos]]; - if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateIndex) + if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateFragmentIndex) SubtractVarMods(piVarModCounts, szProteinSeq[iEndPos], iEndPos); if (iEndPos == iProteinSeqLengthMinus1) dCalcPepMass -= g_staticParams.staticModifications.dAddCterminusProtein; @@ -2263,7 +2268,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, dCalcPepMass = g_staticParams.precalcMasses.dOH2ProtonCtermNterm + g_staticParams.massUtility.pdAAMassParent[(int)szProteinSeq[iStartPos]]; - if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateIndex) + if (g_staticParams.variableModParameters.bVarModSearch && !g_staticParams.options.bCreateFragmentIndex) { for (int x = 0; x < VMODS; ++x) //reset variable mod counts piVarModCounts[x] = 0; @@ -2438,7 +2443,7 @@ int CometSearch::WithinMassTolerance(double dCalcPepMass, && CheckEnzymeTermini(szProteinSeq, iStartPos, iEndPos)) { // if creating indexed database, only care of peptide is within global mass range - if 
(g_staticParams.options.bCreateIndex) + if (g_staticParams.options.bCreatePeptideIndex || g_staticParams.options.bCreateFragmentIndex) { return 1; } @@ -3325,7 +3330,7 @@ void CometSearch::XcorrScore(char *szProteinSeq, if (dXcorr + 0.00005 >= dLowestXcorrScore && iLenPeptide <= g_staticParams.options.peptideLengthRange.iEnd) { // no need to check duplicates if indexed database search and !g_staticParams.options.bTreatSameIL and no internal decoys - if (g_staticParams.bIndexDb && !g_staticParams.options.bTreatSameIL) + if (g_staticParams.iIndexDb && !g_staticParams.options.bTreatSameIL) { StorePeptide(iWhichQuery, iStartResidue, iStartPos, iEndPos, iFoundVariableMod, szProteinSeq, dCalcPepMass, dXcorr, bDecoyPep, piVarModSites, dbe); @@ -3622,7 +3627,7 @@ void CometSearch::StorePeptide(int iWhichQuery, pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr = (float)dXcorr; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = _proteinInfo.cPrevAA; pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = _proteinInfo.cNextAA; @@ -3822,7 +3827,7 @@ void CometSearch::StorePeptide(int iWhichQuery, pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr = (float)dXcorr; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = _proteinInfo.cPrevAA; pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = _proteinInfo.cNextAA; @@ -5821,7 +5826,7 @@ bool CometSearch::MergeVarMods(char *szProteinSeq, // add each single PEFF mod to these existing variable mods. 
// Now that normal variable mods are taken care of, add in PEFF mods if pertinent - if (*bDoPeffAnalysis && !g_staticParams.options.bCreateIndex) + if (*bDoPeffAnalysis && !g_staticParams.options.bCreatePeptideIndex) { int piTmpVarModSites[MAX_PEPTIDE_LEN_P2]; memcpy(piTmpVarModSites, piVarModSites, _iSizepiVarModSites); @@ -5926,7 +5931,31 @@ bool CometSearch::MergeVarMods(char *szProteinSeq, if (bHasVarMod) { - CalcVarModIons(szProteinSeq, iWhichQuery, pbDuplFragment, piVarModSites, dCalcPepMass, iLenPeptide, dbe); + if (g_staticParams.options.bCreatePeptideIndex) + { + Threading::LockMutex(g_pvQueryMutex); + + // add to DBIndex vector + DBIndex sDBTmp; + sDBTmp.dPepMass = dCalcPepMass; //MH+ mass + strncpy(sDBTmp.szPeptide, szProteinSeq + _varModInfo.iStartPos, iLenPeptide); + sDBTmp.szPeptide[iLenPeptide]='\0'; + + sDBTmp.lIndexProteinFilePosition = _proteinInfo.lProteinFilePosition; + + memset(sDBTmp.pcVarModSites, 0, sizeof(sDBTmp.pcVarModSites)); + + for (int x=0; x g_staticParams.variableModParameters.iMaxVarModPerPeptide) g_staticParams.variableModParameters.varModList[i].iMaxNumVarModAAPerMod = g_staticParams.variableModParameters.iMaxVarModPerPeptide; - if (g_staticParams.options.bCreateIndex) + if (g_staticParams.options.bCreateFragmentIndex) { // limit any user specified modification limits to the max supported by fragment ion indexing if (g_staticParams.variableModParameters.varModList[i].iMaxNumVarModAAPerMod > FRAGINDEX_MAX_MODS_PER_MOD) g_staticParams.variableModParameters.varModList[i].iMaxNumVarModAAPerMod = FRAGINDEX_MAX_MODS_PER_MOD; @@ -1649,9 +1650,13 @@ bool CometSearchManager::InitializeStaticParams() g_staticParams.options.iFragIndexNumThreads = (g_staticParams.options.iNumThreads > FRAGINDEX_MAX_THREADS ? 
FRAGINDEX_MAX_THREADS : g_staticParams.options.iNumThreads); // At this point, check extension to set whether index database or not - if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) + if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 7, ".pepidx")) + { + g_staticParams.iIndexDb = 2; // peptide index + } + else if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) { - g_staticParams.bIndexDb = 1; + g_staticParams.iIndexDb = 1; // fragment ion index // if searching fragment index database, limit load of query spectra as no // need to load all spectra into memory since querying spectra sequentially @@ -1659,7 +1664,7 @@ bool CometSearchManager::InitializeStaticParams() g_staticParams.options.iSpectrumBatchSize = FRAGINDEX_MAX_BATCHSIZE; } - if (g_staticParams.options.bCreateIndex && g_staticParams.bIndexDb) + if (g_staticParams.options.bCreateFragmentIndex && g_staticParams.iIndexDb) { char szErrorMsg[SIZE_ERROR]; sprintf(szErrorMsg, " Error - input database already indexed: \"%s\".\n", g_staticParams.databaseInfo.szDatabase); @@ -1669,7 +1674,7 @@ bool CometSearchManager::InitializeStaticParams() return false; } - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { g_bIndexPrecursors = (bool*) malloc(BIN(g_staticParams.options.dPeptideMassHigh)); if (g_bIndexPrecursors == NULL) @@ -1984,16 +1989,6 @@ void CometSearchManager::ResetSearchStatus() } -bool CometSearchManager::CreateIndex() -{ - // Override the Create Index flag to force it to create - g_staticParams.options.bCreateIndex = true; - - // The DoSearch will create the index and exit - return DoSearch(); -} - - bool CometSearchManager::DoSearch() { string strOut; @@ -2029,7 +2024,7 @@ bool CometSearchManager::DoSearch() else g_sCometVersion = std::string(comet_version); - if (!g_staticParams.options.bOutputSqtStream) // && 
!g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream) // && !g_staticParams.iIndexDb) { strOut = "\n Comet version \"" + g_sCometVersion + "\"\n\n"; @@ -2037,7 +2032,7 @@ bool CometSearchManager::DoSearch() fflush(stdout); } - if (g_staticParams.options.bCreateIndex || !g_staticParams.bIndexDb) + if (g_staticParams.options.bCreateFragmentIndex || !g_staticParams.iIndexDb) { // If specified, read in the protein variable mod filter file content. // Do this here only for classic search or if creating the plain peptide index. @@ -2072,7 +2067,7 @@ bool CometSearchManager::DoSearch() tp->fillPool( g_staticParams.options.iNumThreads < 0 ? 0 : g_staticParams.options.iNumThreads-1); - if (g_staticParams.options.bCreateIndex) //index + if (g_staticParams.options.bCreateFragmentIndex) //index { // write out .idx file containing unmodified peptides and protein refs; // this calls RunSearch just to query fasta and generate uniq peptide list @@ -2091,7 +2086,7 @@ bool CometSearchManager::DoSearch() bool bBlankSearchFile = false; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { if (!g_staticParams.options.iFragIndexSkipReadPrecursors) { @@ -2140,7 +2135,7 @@ bool CometSearchManager::DoSearch() time(&tStartTime); strftime(g_staticParams.szDate, 26, "%m/%d/%Y, %I:%M:%S %p", localtime(&tStartTime)); - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { strOut = " Search start: " + string(g_staticParams.szDate) + "\n"; strOut += " - Input file: " + string(g_staticParams.inputFile.szFileName) + "\n"; @@ -2539,7 +2534,7 @@ bool CometSearchManager::DoSearch() FILE *fpdb; // need FASTA file again to grab headers for output (currently just store file positions) string sTmpDB = g_staticParams.databaseInfo.szDatabase; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) sTmpDB = sTmpDB.erase(sTmpDB.size()-4); // need plain fasta if indexdb input if 
((fpdb=fopen(sTmpDB.c_str(), "r")) == NULL) { @@ -2551,7 +2546,7 @@ bool CometSearchManager::DoSearch() return false; } - if (g_staticParams.options.iSpectrumBatchSize == 0 && !g_staticParams.bIndexDb) + if (g_staticParams.options.iSpectrumBatchSize == 0 && !g_staticParams.iIndexDb) { logout(" - Reading all spectra into memory; set \"spectrum_batch_size\" if search terminates here.\n"); fflush(stdout); @@ -2559,12 +2554,12 @@ bool CometSearchManager::DoSearch() CometFragmentIndex sqSearch; - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { if (!g_bPlainPeptideIndexRead) { auto tStartTime = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iIndexDb) { cout << " - read .idx ... "; fflush(stdout); @@ -2572,7 +2567,7 @@ bool CometSearchManager::DoSearch() sqSearch.ReadPlainPeptideIndex(); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iIndexDb) { cout << CometFragmentIndex::ElapsedTime(tStartTime) << endl; } @@ -2582,7 +2577,7 @@ bool CometSearchManager::DoSearch() } auto tBeginTime = chrono::steady_clock::now(); - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { printf(" - searching \"%s\" ... ", g_staticParams.inputFile.szBaseName); fflush(stdout); @@ -2607,7 +2602,7 @@ bool CometSearchManager::DoSearch() #endif // Load and preprocess all the spectra. 
- if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { logout(" - Load spectra:"); @@ -2661,7 +2656,7 @@ bool CometSearchManager::DoSearch() { // need strStatusMsg in it's own scope due to goto statement above string strStatusMsg = " " + to_string(g_pvQuery.size()) + string("\n"); - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { logout(strStatusMsg.c_str()); } @@ -2750,7 +2745,7 @@ bool CometSearchManager::DoSearch() if (!bSucceeded) goto cleanup_results; - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { logout(" - Post analysis:"); fflush(stdout); @@ -2783,7 +2778,7 @@ bool CometSearchManager::DoSearch() std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByScanNumber); // Get flanking amino acid residues - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) { for (int iWhichQuery = 0; iWhichQuery < (int)g_pvQuery.size(); ++iWhichQuery) { @@ -2829,7 +2824,7 @@ bool CometSearchManager::DoSearch() } } - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { logout(" done\n"); fflush(stdout); @@ -2879,7 +2874,7 @@ bool CometSearchManager::DoSearch() break; } - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) cout << CometFragmentIndex::ElapsedTime(tBeginTime) << endl; if (bSucceeded) @@ -2935,7 +2930,7 @@ bool CometSearchManager::DoSearch() remove(szOutputDecoyMzIdentMLtmp); } - if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.bIndexDb) + if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { time_t tEndTime; @@ -3060,7 +3055,7 @@ bool CometSearchManager::DoSearch() break; } - if (g_staticParams.bIndexDb) + if 
(g_staticParams.iIndexDb) { int iNumIndexingThreads = g_staticParams.options.iNumThreads; if (iNumIndexingThreads > FRAGINDEX_MAX_THREADS) diff --git a/CometSearch/CometSearchManager.h b/CometSearch/CometSearchManager.h index e3889759..f5466824 100644 --- a/CometSearch/CometSearchManager.h +++ b/CometSearch/CometSearchManager.h @@ -50,7 +50,6 @@ class CometSearchManager : public ICometSearchManager std::map& GetParamsMap(); // Methods inherited from ICometSearchManager - virtual bool CreateIndex(); virtual bool DoSearch(); virtual bool InitializeSingleSpectrumSearch(); virtual void FinalizeSingleSpectrumSearch(); diff --git a/CometSearch/CometWriteMzIdentML.cpp b/CometSearch/CometWriteMzIdentML.cpp index deadbb76..3a4c450b 100644 --- a/CometSearch/CometWriteMzIdentML.cpp +++ b/CometSearch/CometWriteMzIdentML.cpp @@ -296,7 +296,7 @@ bool CometWriteMzIdentML::ParseTmpFile(FILE *fpout, bool bPrintSequences = false; if (g_staticParams.options.bOutputMzIdentMLFile == 2) // print sequences in DBSequence { - if (g_staticParams.bIndexDb) + if (g_staticParams.iIndexDb) bPrintSequences = false; else bPrintSequences = true; diff --git a/CometSearch/CometWriteOut.cpp b/CometSearch/CometWriteOut.cpp index 61c5797f..cbd377b8 100644 --- a/CometSearch/CometWriteOut.cpp +++ b/CometSearch/CometWriteOut.cpp @@ -251,7 +251,7 @@ bool CometWriteOut::PrintResults(int iWhichQuery, } } - if (g_staticParams.bIndexDb) //index database + if (g_staticParams.iIndexDb) //index database { uiNumTotProteins = (unsigned int)g_pvProteinsList.at(pOutput[i].lProteinFilePosition).size(); } diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 68f4dd44..f611e93d 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,15 +14,15 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static -std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE 
-D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else - override CXXFLAGS += -O3 -static -std=c++14 -fconcepts -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -fconcepts -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 endif COMETSEARCH = Threading.o CometInterfaces.o CometSearch.o CometPreprocess.o CometPostAnalysis.o CometMassSpecUtils.o CometWriteOut.o\ CometWriteSqt.o CometWritePepXML.o CometWriteMzIdentML.o CometWritePercolator.o CometWriteTxt.o CometSearchManager.o\ - CombinatoricsUtils.o ModificationsPermuter.o CometFragmentIndex.o + CombinatoricsUtils.o ModificationsPermuter.o CometFragmentIndex.o CometPeptideIndex.o all: $(COMETSEARCH) @@ -33,7 +33,7 @@ clean: Threading.o: Threading.cpp Threading.h ${CXX} ${CXXFLAGS} Threading.cpp -c -CometSearch.o: CometSearch.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometInterfaces.h ThreadPool.h CometFragmentIndex.h +CometSearch.o: CometSearch.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometInterfaces.h ThreadPool.h CometFragmentIndex.h CometPeptideIndex.h ${CXX} ${CXXFLAGS} CometSearch.cpp -c CometPreprocess.o: CometPreprocess.cpp Common.h CometData.h CometDataInternal.h CometPreprocess.h CometInterfaces.h $(MSTPATH) ${CXX} ${CXXFLAGS} 
CometPreprocess.cpp -c @@ -65,3 +65,5 @@ ModificationsPermuter.o: ModificationsPermuter.cpp ModificationsPermuter.h Com ${CXX} ${CXXFLAGS} ModificationsPermuter.cpp -c CometFragmentIndex.o: CometFragmentIndex.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometInterfaces.h ThreadPool.h ${CXX} ${CXXFLAGS} CometFragmentIndex.cpp -c +CometPeptideIndex.o: CometPeptideIndex.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometInterfaces.h ThreadPool.h + ${CXX} ${CXXFLAGS} CometPeptideIndex.cpp -c diff --git a/Makefile b/Makefile index c410a622..25fbcf3b 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,9 @@ COMETSEARCH = CometSearch UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -std=c++14 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) + override CXXFLAGS += -O3 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) else - override CXXFLAGS += -O3 -static -std=c++14 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) + override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) endif EXECNAME = comet.exe From 2801dea9f78f443fe0cef70bf26862397a549911 Mon Sep 17 00:00:00 2001 From: jke000 Date: Wed, 11 Dec 2024 
11:53:16 -0800 Subject: [PATCH 02/18] another intermediate update; peptide index creation dies with more than 1 thread --- Comet.cpp | 13 +- CometSearch/CometCheckForUpdates.h | 2 +- CometSearch/CometDataInternal.h | 2 + CometSearch/CometFragmentIndex.cpp | 7 +- CometSearch/CometFragmentIndex.h | 9 +- CometSearch/CometMassSpecUtils.cpp | 44 +- CometSearch/CometPostAnalysis.cpp | 2 - CometSearch/CometPostAnalysis.h | 1 + CometSearch/CometPreprocess.h | 1 - CometSearch/CometSearch.cpp | 847 +++++++++++++++++++++++++- CometSearch/CometSearch.h | 19 +- CometSearch/CometSearchManager.cpp | 110 +++- CometSearch/Common.h | 2 +- CometSearch/ModificationsPermuter.cpp | 5 +- CometSearch/ModificationsPermuter.h | 3 - Makefile | 3 +- 16 files changed, 970 insertions(+), 100 deletions(-) diff --git a/Comet.cpp b/Comet.cpp index 1b769b9d..acb66055 100644 --- a/Comet.cpp +++ b/Comet.cpp @@ -100,7 +100,8 @@ void Usage(char *pszCmd) logout(" -F to specify the first/start scan to search, overriding entry in parameters file\n"); logout(" -L to specify the last/end scan to search, overriding entry in parameters file\n"); logout(" (-L option is required if -F option is used)\n"); - logout(" -i create peptide index file only (specify .idx file as database for index search)\n"); + logout(" -i create .idx file for fragment ion indexing\n"); + logout(" -j create .idx file for peptide indexing\n"); logout("\n"); sprintf(szTmp, " example: %s file1.mzXML file2.mzXML\n", pszCmd); logout(szTmp); @@ -197,7 +198,15 @@ void SetOptions(char *arg, break; case 'i': sprintf(szParamStringVal, "1"); - pSearchMgr->SetParam("create_index", szParamStringVal, 1); + pSearchMgr->SetParam("create_fragment_index", szParamStringVal, 1); + sprintf(szParamStringVal, "0"); + pSearchMgr->SetParam("create_peptide_index", szParamStringVal, 0); + break; + case 'j': + sprintf(szParamStringVal, "0"); + pSearchMgr->SetParam("create_fragment_index", szParamStringVal, 0); + sprintf(szParamStringVal, "1"); + 
pSearchMgr->SetParam("create_peptide_index", szParamStringVal, 1); break; default: break; diff --git a/CometSearch/CometCheckForUpdates.h b/CometSearch/CometCheckForUpdates.h index 5fa2941c..349e6a2f 100644 --- a/CometSearch/CometCheckForUpdates.h +++ b/CometSearch/CometCheckForUpdates.h @@ -17,7 +17,7 @@ #define _COMETCHECKFORUPDATES_H_ #include "Common.h" -#include "CometDataInternal.h" +//#include "CometDataInternal.h" #include #include diff --git a/CometSearch/CometDataInternal.h b/CometSearch/CometDataInternal.h index 097688d0..0ebb3736 100644 --- a/CometSearch/CometDataInternal.h +++ b/CometSearch/CometDataInternal.h @@ -720,6 +720,7 @@ struct StaticParams int iPrecursorNLSize; int iOldModsEncoding; bool bSkipToStartScan; + std::chrono::high_resolution_clock::time_point tRealTimeStart; // track run time of real-time index search StaticParams() { @@ -975,6 +976,7 @@ extern int* PEPTIDE_MOD_SEQ_IDXS; extern int MOD_NUM; extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index file is read (and fragment index generated) +extern bool g_bPeptideIndexRead; // set to true if peptide index file is read // Query stores information for peptide scoring and results // This struct is allocated for each spectrum/charge combination diff --git a/CometSearch/CometFragmentIndex.cpp b/CometSearch/CometFragmentIndex.cpp index aa15e2d4..6809f364 100644 --- a/CometSearch/CometFragmentIndex.cpp +++ b/CometSearch/CometFragmentIndex.cpp @@ -13,12 +13,10 @@ // limitations under the License. 
-#include "Common.h" #include "CometFragmentIndex.h" #include "CometSearch.h" #include "ThreadPool.h" #include "CometStatus.h" -//#include "CometPostAnalysis.h" #include "CometMassSpecUtils.h" #include "ModificationsPermuter.h" @@ -37,7 +35,7 @@ int MOD_NUM = 0; Mutex CometFragmentIndex::_vFragmentPeptidesMutex; -//comet_fileoffset_t clSizeCometFileOffset; + #ifdef _WIN32 #ifdef _WIN64 comet_fileoffset_t clSizeCometFileOffset = sizeof(comet_fileoffset_t); //win64 @@ -48,6 +46,7 @@ comet_fileoffset_t clSizeCometFileOffset = (long long)sizeof(comet_fileoffset_t) comet_fileoffset_t clSizeCometFileOffset = sizeof(comet_fileoffset_t); //linux #endif + CometFragmentIndex::CometFragmentIndex() { } @@ -652,7 +651,7 @@ bool CometFragmentIndex::WritePlainPeptideIndex(ThreadPool *tp) exit(1); } - strOut = " Creating plain peptide/protein index file:\n"; + strOut = " Creating plain peptide/protein index file for fragment ion indexing:\n"; logout(strOut.c_str()); fflush(stdout); strOut = " - parse peptides from database ... 
"; diff --git a/CometSearch/CometFragmentIndex.h b/CometSearch/CometFragmentIndex.h index eb94de7f..c457762f 100644 --- a/CometSearch/CometFragmentIndex.h +++ b/CometSearch/CometFragmentIndex.h @@ -17,7 +17,6 @@ #define _COMETFRAGMENTINDEX_H_ #include "Common.h" -#include "CometDataInternal.h" #include "CometSearch.h" #include @@ -32,6 +31,10 @@ class CometFragmentIndex static bool CreateFragmentIndex(ThreadPool *tp); static string ElapsedTime(std::chrono::time_point tStartTime); static int WhichPrecursorBin(double dMass); + static bool CompareByPeptide(const DBIndex &lhs, + const DBIndex &rhs); + static bool CompareByMass(const DBIndex &lhs, + const DBIndex &rhs); private: @@ -52,10 +55,6 @@ class CometFragmentIndex unsigned int y); static void SortFragmentThreadProc(int iWhichThread, ThreadPool* tp); - static bool CompareByPeptide(const DBIndex &lhs, - const DBIndex &rhs); - static bool CompareByMass(const DBIndex &lhs, - const DBIndex &rhs); unsigned int _uiBinnedIonMasses[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; unsigned int _uiBinnedIonMassesDecoy[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index e2624b3e..a51eaed2 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ b/CometSearch/CometMassSpecUtils.cpp @@ -129,45 +129,45 @@ void CometMassSpecUtils::AssignMass(double *pdAAMass, // return a single protein name as a C char string -void CometMassSpecUtils::GetProteinName(FILE *fpdb, +void CometMassSpecUtils::GetProteinName(FILE *fpfasta, comet_fileoffset_t lFilePosition, char *szProteinName) { size_t tTmp; - comet_fseek(fpdb, lFilePosition, SEEK_SET); + comet_fseek(fpfasta, lFilePosition, SEEK_SET); if (g_staticParams.iIndexDb) //fragment ion or peptide index { long lSize; - tTmp = fread(&lSize, sizeof(long), 1, fpdb); + tTmp = fread(&lSize, sizeof(long), 1, fpfasta); vector vOffsets; for (long x = 0; x < lSize; ++x) // read 
file offsets { comet_fileoffset_t tmpoffset; - tTmp = fread(&tmpoffset, sizeof(comet_fileoffset_t), 1, fpdb); + tTmp = fread(&tmpoffset, sizeof(comet_fileoffset_t), 1, fpfasta); vOffsets.push_back(tmpoffset); } for (long x = 0; x < lSize; ++x) // read name from fasta { char szTmp[WIDTH_REFERENCE]; - comet_fseek(fpdb, vOffsets.at(x), SEEK_SET); - tTmp = fread(szTmp, sizeof(char)*WIDTH_REFERENCE, 1, fpdb); + comet_fseek(fpfasta, vOffsets.at(x), SEEK_SET); + tTmp = fread(szTmp, sizeof(char)*WIDTH_REFERENCE, 1, fpfasta); sscanf(szTmp, "%511s", szProteinName); // WIDTH_REFERENCE-1 break; //break here to only get first protein reference (out of lSize) } } else //regular fasta database { - fscanf(fpdb, "%511s", szProteinName); // WIDTH_REFERENCE-1 + fscanf(fpfasta, "%511s", szProteinName); // WIDTH_REFERENCE-1 szProteinName[511] = '\0'; } } // return a single protein sequence as C++ string -void CometMassSpecUtils::GetProteinSequence(FILE *fpdb, +void CometMassSpecUtils::GetProteinSequence(FILE *fpfasta, comet_fileoffset_t lFilePosition, string &strSeq) { @@ -177,13 +177,13 @@ void CometMassSpecUtils::GetProteinSequence(FILE *fpdb, { int iTmpCh; - comet_fseek(fpdb, lFilePosition, SEEK_SET); + comet_fseek(fpfasta, lFilePosition, SEEK_SET); // skip to end of description line - while (((iTmpCh = getc(fpdb)) != '\n') && (iTmpCh != '\r') && (iTmpCh != EOF)); + while (((iTmpCh = getc(fpfasta)) != '\n') && (iTmpCh != '\r') && (iTmpCh != EOF)); // load sequence - while (((iTmpCh=getc(fpdb)) != '>') && (iTmpCh != EOF)) + while (((iTmpCh=getc(fpfasta)) != '>') && (iTmpCh != EOF)) { if ('a'<=iTmpCh && iTmpCh<='z') { @@ -205,7 +205,7 @@ void CometMassSpecUtils::GetProteinSequence(FILE *fpdb, // return all matched protein names in a vector of strings -void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, +void CometMassSpecUtils::GetProteinNameString(FILE *fpfasta, int iWhichQuery, // which search int iWhichResult, // which peptide within the search int iPrintTargetDecoy, // 0 = 
target+decoys, 1=target only, 2=decoy only @@ -241,8 +241,8 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, comet_fileoffset_t lEntry = pOutput[iWhichResult].lProteinFilePosition; for (auto it = g_pvProteinsList.at(lEntry).begin(); it != g_pvProteinsList.at(lEntry).end(); ++it) { - comet_fseek(fpdb, *it, SEEK_SET); - fscanf(fpdb, "%511s", szProteinName); // WIDTH_REFERENCE-1 + comet_fseek(fpfasta, *it, SEEK_SET); + fscanf(fpfasta, "%511s", szProteinName); // WIDTH_REFERENCE-1 szProteinName[511] = '\0'; if (!strncmp(szProteinName, g_staticParams.szDecoyPrefix, iLenDecoyPrefix)) @@ -286,8 +286,8 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, { for (it=pOutput[iWhichResult].pWhichProtein.begin(); it!=pOutput[iWhichResult].pWhichProtein.end(); ++it) { - comet_fseek(fpdb, (*it).lWhichProtein, SEEK_SET); - fscanf(fpdb, "%511s", szProteinName); // WIDTH_REFERENCE-1 + comet_fseek(fpfasta, (*it).lWhichProtein, SEEK_SET); + fscanf(fpfasta, "%511s", szProteinName); // WIDTH_REFERENCE-1 szProteinName[511] = '\0'; vProteinTargets.push_back(szProteinName); @@ -309,8 +309,8 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, if (iPrintDuplicateProteinCt >= g_staticParams.options.iMaxDuplicateProteins) break; - comet_fseek(fpdb, (*it).lWhichProtein, SEEK_SET); - fscanf(fpdb, "%511s", szProteinName); // WIDTH_REFERENCE-1 + comet_fseek(fpfasta, (*it).lWhichProtein, SEEK_SET); + fscanf(fpfasta, "%511s", szProteinName); // WIDTH_REFERENCE-1 szProteinName[511] = '\0'; if (strlen(szProteinName) + iLenDecoyPrefix >= WIDTH_REFERENCE) @@ -328,7 +328,7 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, // find prev, next AA from first matched protein // this is only valid if searching indexed db with peptide/protein .idx file -void CometMassSpecUtils::GetPrevNextAA(FILE *fpdb, +void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, int iWhichQuery, // which search int iWhichResult, // which peptide within the search int iPrintTargetDecoy, // 0 = 
target+decoys, 1=target only, 2=decoy only @@ -356,13 +356,13 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpdb, { string strSeq; - comet_fseek(fpdb, *it, SEEK_SET); + comet_fseek(fpfasta, *it, SEEK_SET); // skip through protein name string to first carriage return - while (((iTmpCh = getc(fpdb)) != '\n') && (iTmpCh != '\r') && (iTmpCh != EOF)); + while (((iTmpCh = getc(fpfasta)) != '\n') && (iTmpCh != '\r') && (iTmpCh != EOF)); // Load sequence - while (((iTmpCh=getc(fpdb)) != '>') && (iTmpCh != EOF)) + while (((iTmpCh=getc(fpfasta)) != '>') && (iTmpCh != EOF)) { if ('a' <= iTmpCh && iTmpCh <= 'z') { diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index cd93ee33..65ada640 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -14,13 +14,11 @@ #include "Common.h" -#include "CometDataInternal.h" #include "ThreadPool.h" #include "CometPostAnalysis.h" #include "CometMassSpecUtils.h" #include "CometStatus.h" - #include "CometDecoys.h" // this is where decoyIons[EXPECT_DECOY_SIZE] is initialized diff --git a/CometSearch/CometPostAnalysis.h b/CometSearch/CometPostAnalysis.h index a54b934f..f2126b94 100644 --- a/CometSearch/CometPostAnalysis.h +++ b/CometSearch/CometPostAnalysis.h @@ -16,6 +16,7 @@ #ifndef _COMETPOSTANALYSIS_H_ #define _COMETPOSTANALYSIS_H_ +#include "CometDataInternal.h" struct PostAnalysisThreadData { diff --git a/CometSearch/CometPreprocess.h b/CometSearch/CometPreprocess.h index aeef823f..2f30afca 100644 --- a/CometSearch/CometPreprocess.h +++ b/CometSearch/CometPreprocess.h @@ -16,7 +16,6 @@ #ifndef _COMETPREPROCESS_H_ #define _COMETPREPROCESS_H_ -#include "Common.h" #include "ThreadPool.h" struct PreprocessThreadData diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index f12071c6..efb5118c 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -12,24 +12,30 @@ // See the License for the specific language governing permissions and // 
limitations under the License. - #include "Common.h" #include "CometSearch.h" +#include "CometDataInternal.h" #include "ThreadPool.h" #include "CometStatus.h" #include "CometPostAnalysis.h" #include "CometMassSpecUtils.h" #include "CometFragmentIndex.h" +#include "CometPeptideIndex.h" #include "ModificationsPermuter.h" #include #include #include #include +#include + bool *CometSearch::_pbSearchMemoryPool; bool **CometSearch::_ppbDuplFragmentArr; +extern comet_fileoffset_t clSizeCometFileOffset; + + CometSearch::CometSearch() { // Initialize the header modification string - won't change. @@ -98,22 +104,36 @@ bool CometSearch::DeallocateMemory(int maxNumThreads) } - // called by DoSingleSpectrumSearch bool CometSearch::RunSearch(ThreadPool *tp) { CometFragmentIndex sqFI; CometSearch sqSearch; + size_t iWhichQuery = 0; - if (!g_bPlainPeptideIndexRead) + if (g_staticParams.iIndexDb == 1) // fragment index { - sqFI.ReadPlainPeptideIndex(); - sqFI.CreateFragmentIndex(tp); - } - - size_t iWhichQuery = 0; + if (!g_bPlainPeptideIndexRead) + { + sqFI.ReadPlainPeptideIndex(); + sqFI.CreateFragmentIndex(tp); + } - sqSearch.SearchFragmentIndex(iWhichQuery, tp); + sqSearch.SearchFragmentIndex(iWhichQuery, tp); + } + else if (g_staticParams.iIndexDb == 2) // peptide index + { + sqSearch.SearchPeptideIndex(); + } + else + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error - index search but iIndexDb=%d\n", g_staticParams.iIndexDb); + string strErrorMsg(szErrorMsg); + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(szErrorMsg); + return false; + } return true; } @@ -163,7 +183,8 @@ bool CometSearch::RunSearch(int iPercentStart, } else if (g_staticParams.iIndexDb == 2) { - + CometSearch sqSearch; + sqSearch.SearchPeptideIndex(); } else { @@ -945,7 +966,7 @@ bool CometSearch::MapOBO(string strMod, vector *vectorPeffOBO, struct // find match of strMod in vectorPeffOBO and store diff masses in pData - iPos=BinarySearchPeffStrMod(0, 
(int)(*vectorPeffOBO).size(), strMod, *vectorPeffOBO); + iPos = BinarySearchPeffStrMod(0, (int)(*vectorPeffOBO).size(), strMod, *vectorPeffOBO); if (iPos != -1 && iPos< (int)(*vectorPeffOBO).size() ) { @@ -1486,7 +1507,7 @@ void CometSearch::SearchFragmentIndex(size_t iWhichQuery, // iLenPeptide-1 to complete set of internal fragment ions. for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, pdAAforward, pdAAreverse); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, pdAAforward, pdAAreverse); int iVal = BIN(dFragMass); if (pbDuplFragment[iVal] == false) @@ -1548,6 +1569,782 @@ void CometSearch::SearchFragmentIndex(size_t iWhichQuery, } +bool CometSearch::SearchPeptideIndex(void) +{ + comet_fileoffset_t lEndOfStruct; + char szBuf[SIZE_BUF]; + FILE *fp; + size_t tTmp; + + CometPostAnalysis cpa; + + if ((fp = fopen(g_staticParams.databaseInfo.szDatabase, "rb")) == NULL) + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error - cannot read indexed database file \"%s\" %s.\n", g_staticParams.databaseInfo.szDatabase, strerror(errno)); + string strErrorMsg(szErrorMsg); + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(szErrorMsg); + return false; + } + + // ignore any static masses in params file; only valid ones + // are those in database index + memset(g_staticParams.staticModifications.pdStaticMods, 0, sizeof(g_staticParams.staticModifications.pdStaticMods)); + + bool bFoundStatic = false; + bool bFoundVariable = false; + + // read in static and variable mods + while (fgets(szBuf, sizeof(szBuf), fp)) + { + if (!strncmp(szBuf, "MassType:", 9)) + { + sscanf(szBuf, "%d %d", &g_staticParams.massUtility.bMonoMassesParent, &g_staticParams.massUtility.bMonoMassesFragment); + } + + if (!strncmp(szBuf, "StaticMod:", 10)) + { + char *tok; + char delims[] = " "; + int x=65; + + // FIX: hack here for setting static mods; need to reset masses 
... fix later + CometMassSpecUtils::AssignMass(g_staticParams.massUtility.pdAAMassFragment, + g_staticParams.massUtility.bMonoMassesFragment, + &g_staticParams.massUtility.dOH2fragment); + + bFoundStatic = true; + tok=strtok(szBuf+11, delims); + while (tok != NULL) + { + sscanf(tok, "%lf", &(g_staticParams.staticModifications.pdStaticMods[x])); + g_staticParams.massUtility.pdAAMassFragment[x] += g_staticParams.staticModifications.pdStaticMods[x]; + tok = strtok(NULL, delims); + x++; + if (x==95) // 65-90 stores A-Z then next 4 (ascii 91-94) are n/c term peptide, n/c term protein + break; + } + + g_staticParams.staticModifications.dAddNterminusPeptide = g_staticParams.staticModifications.pdStaticMods[91]; + g_staticParams.staticModifications.dAddCterminusPeptide = g_staticParams.staticModifications.pdStaticMods[92]; + g_staticParams.staticModifications.dAddNterminusProtein = g_staticParams.staticModifications.pdStaticMods[93]; + g_staticParams.staticModifications.dAddCterminusProtein = g_staticParams.staticModifications.pdStaticMods[94]; + + // have to set these here again once static mods are read + g_staticParams.precalcMasses.dNtermProton = g_staticParams.staticModifications.dAddNterminusPeptide + + PROTON_MASS; + + g_staticParams.precalcMasses.dCtermOH2Proton = g_staticParams.staticModifications.dAddCterminusPeptide + + g_staticParams.massUtility.dOH2fragment + + PROTON_MASS; + + g_staticParams.precalcMasses.dOH2ProtonCtermNterm = g_staticParams.massUtility.dOH2parent + + PROTON_MASS + + g_staticParams.staticModifications.dAddCterminusPeptide + + g_staticParams.staticModifications.dAddNterminusPeptide; + } + + if (!strncmp(szBuf, "VariableMod:", 12)) + { + char *tok; + char delims[] = " "; + int x=0; + + bFoundVariable = true; + + tok=strtok(szBuf+13, delims); + while (tok != NULL) + { + tok = strtok(NULL, delims); // skip list of var mod residues + + // for index search, storing variable mods 0-9 in pdStaticMods array 0-9 + sscanf(tok, "%lf:%lf", 
&(g_staticParams.variableModParameters.varModList[x].dVarModMass), + &(g_staticParams.variableModParameters.varModList[x].dNeutralLoss)); + + if (g_staticParams.variableModParameters.varModList[x].dNeutralLoss != 0.0) + g_staticParams.variableModParameters.bUseFragmentNeutralLoss = true; + + tok = strtok(NULL, delims); + + x++; + if (x == VMODS) + break; + } + break; + } + } + + if (!(bFoundStatic && bFoundVariable)) + { + char szErr[256]; + sprintf(szErr, " Error with index database format. Mods not parsed (%d %d).", bFoundStatic, bFoundVariable); + logerr(szErr); + fclose(fp); + return false; + } + + // indexed searches will always set this to true + g_staticParams.variableModParameters.bVarModSearch = true; + + // read fp of index + comet_fileoffset_t clTmp; + comet_fileoffset_t clProteinsFilePos; + + comet_fseek(fp, -clSizeCometFileOffset*2, SEEK_END); + tTmp = fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); + tTmp = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); + + // now read in: vector> g_pvProteinsList + comet_fseek(fp, clProteinsFilePos, SEEK_SET); + size_t tSize; + tTmp = fread(&tSize, clSizeCometFileOffset, 1, fp); + vector vTmp; + + g_pvProteinsList.clear(); + g_pvProteinsList.reserve(tSize); + for (size_t it = 0; it < tSize; ++it) + { + size_t tNumProteinOffsets; + tTmp = fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); + + vTmp.clear(); + for (size_t it2 = 0; it2 < tNumProteinOffsets; ++it2) + { + tTmp = fread(&clTmp, clSizeCometFileOffset, 1, fp); + vTmp.push_back(clTmp); + } + g_pvProteinsList.push_back(vTmp); + } + + // seek to index + comet_fseek(fp, lEndOfStruct, SEEK_SET); + + // read index + int iMinMass=0; + int iMaxMass=0; + uint64_t tNumPeptides=0; + + tTmp = fread(&iMinMass, sizeof(int), 1, fp); + tTmp = fread(&iMaxMass, sizeof(int), 1, fp); + tTmp = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); + + // sanity checks + if (iMinMass < 0 || iMinMass > 20000 || iMaxMass < 0 || iMaxMass > 20000) + { + char szErr[256]; + 
sprintf(szErr, " Error reading .idx database: min mass %d, max mass %d, num peptides %zu\n", iMinMass, iMaxMass, tNumPeptides); + logerr(szErr); + fclose(fp); + return false; + } + + int iMaxPeptideMass10 = iMaxMass * 10; + comet_fileoffset_t *lReadIndex = new comet_fileoffset_t[iMaxPeptideMass10]; + for (int i=0; i< iMaxPeptideMass10; i++) + lReadIndex[i] = -1; + + tTmp = fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); + + int iStart = (int)(g_massRange.dMinMass - 0.5); // smallest mass/index start + int iEnd = (int)(g_massRange.dMaxMass + 0.5); // largest mass/index end + + if (iStart > iMaxMass) // smallest input mass is greater than what's stored in index + { + delete[] lReadIndex; + fclose(fp); + return true; + } + + if (iStart < iMinMass) + iStart = iMinMass; + if (iEnd > iMaxMass) + iEnd = iMaxMass; + + int iStart10 = (int)(g_massRange.dMinMass*10.0 - 0.5); // lReadIndex is at 0.1 resolution for index value so scale iStart/iEnd to be same + int iEnd10 = (int)(g_massRange.dMaxMass*10.0 + 0.5); + + if (iStart10 < iMinMass*10) + iStart10 = iMinMass*10; + if (iEnd10 > iMaxMass*10) + iEnd10 = iMaxMass*10; + + struct DBIndex sDBI; + sDBEntry dbe; + + while (lReadIndex[iStart10] == -1 && iStart10 < iEnd10) + iStart10++; + + if (lReadIndex[iStart10] == -1) // no match found within tolerance + { + delete[] lReadIndex; + fclose(fp); + return true; + } + + comet_fseek(fp, lReadIndex[iStart10], SEEK_SET); + CometPeptideIndex::ReadPeptideIndexEntry(&sDBI, fp); + + while ((int)(sDBI.dPepMass * 10) <= iEnd10) + { +/* + printf("OK index pep "); + for (unsigned int x=0; x g_massRange.dMaxMass) + break; + + CometSearch cs; + int iWhichQuery = cs.BinarySearchMass(0, (int)g_pvQuery.size(), sDBI.dPepMass); + + while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) + iWhichQuery--; + + // Do the search + if (iWhichQuery != -1) + cs.AnalyzePeptideIndex(iWhichQuery, sDBI, _ppbDuplFragmentArr[0], &dbe); + + 
if (comet_ftell(fp)>=lEndOfStruct || sDBI.dPepMass>g_massRange.dMaxMass) + break; + + CometPeptideIndex::ReadPeptideIndexEntry(&sDBI, fp); + + // read past last entry in indexed db, need to break out of loop + if (feof(fp)) + break; + + if (g_staticParams.options.iMaxIndexRunTime > 0) + { + // now check search run time + std::chrono::high_resolution_clock::time_point tNow = std::chrono::high_resolution_clock::now(); + auto tElapsedTime = std::chrono::duration_cast(tNow - g_staticParams.tRealTimeStart).count(); + if (tElapsedTime >= g_staticParams.options.iMaxIndexRunTime) + break; + } + } + +// vector::iterator it = g_pvQuery.begin(); + for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) // g_pvQuery is always size 1 here; for loop is useless + { + int iNumMatchedPeptides; + + iNumMatchedPeptides = (*it)->iMatchPeptideCount; + if (iNumMatchedPeptides > g_staticParams.options.iNumStored) + iNumMatchedPeptides = g_staticParams.options.iNumStored; + + if (iNumMatchedPeptides > 0) // will retrieve protein names here if there is one or more matched peptides + { + // sort and report results by xcorr + std::sort((*it)->_pResults, (*it)->_pResults + iNumMatchedPeptides, cpa.SortFnXcorr); + + for (int ii = 0; ii < iNumMatchedPeptides; ii++) // loop through all hits to this one spectrum query + { + if (ii > 0 && (*it)->_pResults[ii].fXcorr < (*it)->_pResults[0].fXcorr) // do this only for peptides that have same top xcorr, could be more than 1 + break; + + std::vector::iterator itProt; + bool bPrintDecoyPrefix = false; + + // Note peptides can be from target or internal decoy. If peptide is from a target protein, + // Comet will only report target protein matches and not internal decoy protein matches. + // Decoy proteins only reported for peptides that are exclusively decoy matches. 
+ if ((*it)->_pResults[ii].pWhichProtein.size() > 0) + itProt = (*it)->_pResults[ii].pWhichProtein.begin(); // list of target proteins + else + { + itProt = (*it)->_pResults[ii].pWhichDecoyProtein.begin(); // list of decoy proteins + bPrintDecoyPrefix = true; + } + } +/* + for (int x = 0; x < iNumMatchedPeptides; x++) + { + printf("OK %d scan %d, pep %s, xcorr %f, mass %f, matchcount %d, prot %s\n", x, + (*it)->_spectrumInfoInternal.iScanNumber, + (*it)->_pResults[x].szPeptide, + (*it)->_pResults[x].fXcorr, + (*it)->_pResults[x].dPepMass, + (*it)->iMatchPeptideCount, + (*it)->_pResults[x].strSingleSearchProtein.c_str()); fflush(stdout); + } +*/ + } + } + + delete [] lReadIndex; + fclose(fp); + return true; +} + + +void CometSearch::AnalyzePeptideIndex(int iWhichQuery, + DBIndex sDBI, + bool *pbDuplFragment, + struct sDBEntry *dbe) +{ + int iWhichIonSeries; + int ctIonSeries; + int ctLen; + int ctCharge; + int iLenPeptide = (int)strlen(sDBI.szPeptide); + int iStartPos = 0; + int iEndPos = iLenPeptide - 1; + int iUnused = 0; + bool bFirstTimeThroughLoopForPeptide = true; + + int iFoundVariableMod = 0; // 1 = variable mod, 2 = with fragment NL + int iFoundVariableModDecoy = 0; + + int iPositionNLB[VMODS]; + int iPositionNLY[VMODS]; + + double _pdAAforward[MAX_PEPTIDE_LEN]; // Stores fragment ion fragment ladder calc.; sum AA masses including mods + double _pdAAreverse[MAX_PEPTIDE_LEN]; // Stores n-term fragment ion fragment ladder calc.; sum AA masses including mods + double _pdAAforwardDecoy[MAX_PEPTIDE_LEN]; // Stores fragment ion fragment ladder calc.; sum AA masses including mods + double _pdAAreverseDecoy[MAX_PEPTIDE_LEN]; // Stores n-term fragment ion fragment ladder calc.; sum AA masses including mods + + CometSearch cs; + + if (g_staticParams.variableModParameters.bUseFragmentNeutralLoss) + { + int i; + + for (i=0; i 0) + { + if (g_staticParams.variableModParameters.varModList[sDBI.pcVarModSites[i]-1].dNeutralLoss != 0.0) + { + 
iPositionNLB[sDBI.pcVarModSites[i] -1] = i; + break; + } + } + } + + for (i=iEndPos; i>=0; i--) + { + if (sDBI.pcVarModSites[i] > 0) + { + if (g_staticParams.variableModParameters.varModList[sDBI.pcVarModSites[i]-1].dNeutralLoss != 0.0) + { + iPositionNLY[sDBI.pcVarModSites[i] -1] = i; + break; + } + } + } + } + + // Compare calculated fragment ions against all matching query spectra. + while (iWhichQuery < (int)g_pvQuery.size()) + { + if (sDBI.dPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + { + // If calculated mass is smaller than low mass range. + break; + } + + // Mass tolerance check for particular query against this candidate peptide mass. + if (cs.CheckMassMatch(iWhichQuery, sDBI.dPepMass)) + { + char szDecoyPeptide[MAX_PEPTIDE_LEN]; + int piVarModSites[MAX_PEPTIDE_LEN_P2]; // forward mods, generated from sDBI.sVarModSites + int piVarModSitesDecoy[MAX_PEPTIDE_LEN_P2]; + + int iLen2 = iLenPeptide + 2; + for (int x = 0; x < iLen2; x++) + piVarModSites[x] = sDBI.pcVarModSites[x]; + + // Calculate ion series just once to compare against all relevant query spectra. + if (bFirstTimeThroughLoopForPeptide) + { + int iLenMinus1 = iEndPos - iStartPos; // Equals iLenPeptide minus 1. 
+ int i; + + bFirstTimeThroughLoopForPeptide = false; + + double dBion = g_staticParams.precalcMasses.dNtermProton; + double dYion = g_staticParams.precalcMasses.dCtermOH2Proton; + +/* n/c-term protein mods not supported + if (iStartPos == 0) + dBion += g_staticParams.staticModifications.dAddNterminusProtein; + if (iEndPos == iLenProteinMinus1) + dYion += g_staticParams.staticModifications.dAddCterminusProtein; +*/ + + // variable N-term peptide mod + if (piVarModSites[iLenPeptide] > 0) + { + dBion += g_staticParams.variableModParameters.varModList[piVarModSites[iLenPeptide] - 1].dVarModMass; + iFoundVariableMod = 1; + } + + // variable C-term peptide mod + if (piVarModSites[iLenPeptide + 1] > 0) + { + dYion += g_staticParams.variableModParameters.varModList[piVarModSites[iLenPeptide + 1] - 1].dVarModMass; + iFoundVariableMod = 1; + } + + // Generate pdAAforward for sDBI.szPeptide + for (int i=iStartPos; i 0) + { + dBion += g_staticParams.variableModParameters.varModList[piVarModSites[iPos]-1].dVarModMass; + iFoundVariableMod = 1; + } + + dYion += g_staticParams.massUtility.pdAAMassFragment[(int)sDBI.szPeptide[iPos2]]; + if (piVarModSites[iPos2] > 0) + { + dYion += g_staticParams.variableModParameters.varModList[piVarModSites[iPos2]-1].dVarModMass; + iFoundVariableMod = 1; + } + + _pdAAforward[iPos] = dBion; + _pdAAreverse[iPos] = dYion; + } + + // Now get the set of binned fragment ions once to compare this peptide against all matching spectra. 
+ // First initialize pbDuplFragment and _uiBinnedIonMasses + for (ctCharge=1; ctCharge<=g_massRange.iMaxFragmentCharge; ctCharge++) + { + for (ctIonSeries=0; ctIonSeries= iPositionNLB[x]) // 0/1/2 is a/b/c ions + || (iWhichIonSeries >= 3 && iWhichIonSeries <= 5 && iLenMinus1-ctLen <= iPositionNLY[x])) // 3/4/5 is x/y/z ions + { + double dNewMass = dFragMass - g_staticParams.variableModParameters.varModList[x].dNeutralLoss/ctCharge; + + if (dNewMass >= 0.0) + { + pbDuplFragment[BIN(dNewMass)] = false; + iFoundVariableMod = 2; + } + + _uiBinnedIonMasses[ctCharge][ctIonSeries][ctLen][x+1] = 0; + } + } + } + } + } + } + + for (int ctNL=0; ctNL_spectrumInfoInternal.iChargeState; ctCharge>=1; ctCharge--) + { + double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge*PROTON_MASS)/ctCharge; + int iVal = BIN(dNLMass); + + if (iVal > 0) + { + pbDuplFragment[iVal] = false; + _uiBinnedPrecursorNL[ctNL][ctCharge] = 0; + } + } + } + + for (ctCharge=1; ctCharge<=g_massRange.iMaxFragmentCharge; ctCharge++) + { + for (ctIonSeries=0; ctIonSeries= iPositionNLB[x]) // 0/1/2 is a/b/c ions + || (iWhichIonSeries >= 3 && iWhichIonSeries <= 5 && iLenMinus1-ctLen <= iPositionNLY[x])) // 3/4/5 is x/y/z ions + { + double dNewMass = dFragMass - g_staticParams.variableModParameters.varModList[x].dNeutralLoss/ctCharge; + + iVal = BIN(dNewMass); + + if (iVal > 0 && pbDuplFragment[iVal] == false) + { + _uiBinnedIonMasses[ctCharge][ctIonSeries][ctLen][x+1] = iVal; + pbDuplFragment[iVal] = true; + } + } + } + } + } + } + } + + // Precursor NL peaks added here + for (int ctNL=0; ctNL_spectrumInfoInternal.iChargeState; ctCharge>=1; ctCharge--) + { + double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge*PROTON_MASS)/ctCharge; + int iVal = BIN(dNLMass); + + if (iVal > 0 && pbDuplFragment[iVal] == false) + { + _uiBinnedPrecursorNL[ctNL][ctCharge] = iVal; + pbDuplFragment[iVal] = true; + } + } + } + + if 
(g_staticParams.options.iDecoySearch) + { + if (g_staticParams.enzymeInformation.iSearchEnzymeOffSet == 1) + { + // last residue stays the same: change ABCDEK to EDCBAK + + for (i = iEndPos - 1; i >= iStartPos; i--) + { + szDecoyPeptide[iEndPos - i - 1] = sDBI.szPeptide[i - iStartPos]; + piVarModSitesDecoy[iEndPos - i - 1] = piVarModSites[i - iStartPos]; + } + + szDecoyPeptide[iEndPos] = sDBI.szPeptide[iEndPos]; // last residue stays same + piVarModSitesDecoy[iLenPeptide - 1] = piVarModSites[iLenPeptide - 1]; + } + else + { + // first residue stays the same: change ABCDEK to AKEDCB + + for (i = iEndPos; i > iStartPos; i--) + { + szDecoyPeptide[iEndPos - i + 1] = sDBI.szPeptide[i - iStartPos]; + piVarModSitesDecoy[iEndPos - i + 1] = piVarModSites[i - iStartPos]; + } + + szDecoyPeptide[iStartPos] = sDBI.szPeptide[iStartPos]; // first residue stays same + piVarModSitesDecoy[iStartPos] = piVarModSites[iStartPos]; + } + + piVarModSitesDecoy[iLenPeptide] = piVarModSites[iLenPeptide]; // N-term + piVarModSitesDecoy[iLenPeptide + 1] = piVarModSites[iLenPeptide + 1]; // C-term + + // Now need to recalculate _pdAAforward and _pdAAreverse for decoy entry + dBion = g_staticParams.precalcMasses.dNtermProton; + dYion = g_staticParams.precalcMasses.dCtermOH2Proton; + +/* n/c-term protein mods not supported + // use same protein terminal static mods as target peptide + if (_varModInfo.iStartPos == 0) + dBion += g_staticParams.staticModifications.dAddNterminusProtein; + if (_varModInfo.iEndPos == iLenProteinMinus1) + dYion += g_staticParams.staticModifications.dAddCterminusProtein; +*/ + + // variable N-term + if (piVarModSitesDecoy[iLenPeptide] > 0) + dBion += g_staticParams.variableModParameters.varModList[piVarModSitesDecoy[iLenPeptide] - 1].dVarModMass; + + // variable C-term + if (piVarModSitesDecoy[iLenPeptide + 1] > 0) + dYion += g_staticParams.variableModParameters.varModList[piVarModSitesDecoy[iLenPeptide + 1] - 1].dVarModMass; + + int iDecoyStartPos = iStartPos; + int 
iDecoyEndPos = iEndPos; + + // Generate pdAAforward for szDecoyPeptide + for (i = iDecoyStartPos; i < iDecoyEndPos; i++) + { + int iPos = i - iDecoyStartPos; + int iPos2 = iDecoyEndPos - i + iDecoyStartPos; + + dBion += g_staticParams.massUtility.pdAAMassFragment[(int)szDecoyPeptide[i]]; + if (piVarModSitesDecoy[iPos] > 0) + { + dBion += g_staticParams.variableModParameters.varModList[piVarModSitesDecoy[iPos] - 1].dVarModMass; + iFoundVariableModDecoy = 1; + } + + dYion += g_staticParams.massUtility.pdAAMassFragment[(int)szDecoyPeptide[iPos2]]; + if (piVarModSitesDecoy[iPos2] > 0) + { + dYion += g_staticParams.variableModParameters.varModList[piVarModSitesDecoy[iPos2] - 1].dVarModMass; + iFoundVariableModDecoy = 1; + } + + _pdAAforwardDecoy[iPos] = dBion; + _pdAAreverseDecoy[iPos] = dYion; + } + + // Now get the set of binned fragment ions once to compare this peptide against all matching spectra. + // First initialize pbDuplFragment and _uiBinnedIonMassesDecoy + for (ctCharge = 1; ctCharge <= g_massRange.iMaxFragmentCharge; ctCharge++) + { + for (ctIonSeries = 0; ctIonSeries < g_staticParams.ionInformation.iNumIonSeriesUsed; ctIonSeries++) + { + iWhichIonSeries = g_staticParams.ionInformation.piSelectedIonSeries[ctIonSeries]; + + for (ctLen = 0; ctLen < iLenMinus1; ctLen++) + { + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); + + pbDuplFragment[BIN(dFragMass)] = false; + _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][0] = 0; + + // initialize fragmentNL + if (g_staticParams.variableModParameters.bUseFragmentNeutralLoss) + { + for (int x = 0; x < VMODS; x++) // should be within this if() because only looking for NL masses from each mod + { + if ((iWhichIonSeries <= 2 && ctLen >= iPositionNLB[x]) // 0/1/2 is a/b/c ions + || (iWhichIonSeries >= 3 && iWhichIonSeries <= 5 && iLenMinus1 - ctLen <= iPositionNLY[x])) // 3/4/5 is x/y/z ions + { + double dNewMass = dFragMass - 
g_staticParams.variableModParameters.varModList[x].dNeutralLoss / ctCharge; + + if (dNewMass >= 0.0) + { + pbDuplFragment[BIN(dNewMass)] = false; + iFoundVariableModDecoy = 2; + } + + _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][x + 1] = 0; + } + } + } + } + } + } + + for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) + { + for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.iChargeState; ctCharge >= 1; ctCharge--) + { + double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; + int iVal = BIN(dNLMass); + + if (iVal > 0) + { + pbDuplFragment[iVal] = false; + _uiBinnedPrecursorNLDecoy[ctNL][ctCharge] = 0; + } + } + } + + for (ctCharge = 1; ctCharge <= g_massRange.iMaxFragmentCharge; ctCharge++) + { + for (ctIonSeries = 0; ctIonSeries < g_staticParams.ionInformation.iNumIonSeriesUsed; ctIonSeries++) + { + iWhichIonSeries = g_staticParams.ionInformation.piSelectedIonSeries[ctIonSeries]; + + // As both _pdAAforward and _pdAAreverse are increasing, loop through + // iLenPeptide-1 to complete set of internal fragment ions. 
+ for (ctLen = 0; ctLen < iLenMinus1; ctLen++) + { + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); + int iVal = BIN(dFragMass); + + if (pbDuplFragment[iVal] == false) + { + _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][0] = iVal; + pbDuplFragment[iVal] = true; + } + + if (g_staticParams.variableModParameters.bUseFragmentNeutralLoss) + { + for (int x = 0; x < VMODS; x++) + { + if ((iWhichIonSeries <= 2 && ctLen >= iPositionNLB[x]) // 0/1/2 is a/b/c ions + || (iWhichIonSeries >= 3 && iWhichIonSeries <= 5 && iLenMinus1 - ctLen <= iPositionNLY[x])) // 3/4/5 is x/y/z ions + { + double dNewMass = dFragMass - g_staticParams.variableModParameters.varModList[x].dNeutralLoss / ctCharge; + + iVal = BIN(dNewMass); + + if (iVal > 0 && pbDuplFragment[iVal] == false) + { + _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][x + 1] = iVal; + pbDuplFragment[iVal] = true; + } + } + } + } + } + } + } + + // Precursor NL peaks added here + for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) + { + for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.iChargeState; ctCharge >= 1; ctCharge--) + { + double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; + int iVal = BIN(dNLMass); + + if (iVal > 0 && pbDuplFragment[iVal] == false) + { + _uiBinnedPrecursorNLDecoy[ctNL][ctCharge] = iVal; + pbDuplFragment[iVal] = true; + } + } + } + } + } + + XcorrScore(sDBI.szPeptide, iUnused, iUnused, iStartPos, iEndPos, iFoundVariableMod, + sDBI.dPepMass, false, iWhichQuery, iLenPeptide, piVarModSites, dbe); + + if (g_staticParams.options.iDecoySearch) + { + XcorrScore(szDecoyPeptide, iUnused, iUnused, iStartPos, iEndPos, iFoundVariableModDecoy, + sDBI.dPepMass, true, iWhichQuery, iLenPeptide, piVarModSitesDecoy, dbe); + } + } + iWhichQuery++; + } +} + + // Compare MSMS data to peptide with szProteinSeq from the input 
database. // iNtermPeptideOnly==0 specifies normal sequence // iNtermPeptideOnly==1 specifies clipped methionine sequence @@ -1961,7 +2758,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - pbDuplFragment[BIN(GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse))] = false; + pbDuplFragment[BIN(CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse))] = false; _uiBinnedIonMasses[ctCharge][ctIonSeries][ctLen][0] = 0; // note no need to initialize fragment NL positions as no mods here } @@ -1995,7 +2792,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // iLenPeptide-1 to complete set of internal fragment ions. for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - int iVal = BIN(GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse)); + int iVal = BIN(CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse)); if (pbDuplFragment[iVal] == false) { @@ -2110,7 +2907,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - pbDuplFragment[BIN(GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy))] = false; + pbDuplFragment[BIN(CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy))] = false; _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][0] = 0; } } @@ -2142,7 +2939,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // iLenPeptide-1 to complete set of internal fragment ions. 
for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); int iVal = BIN(dFragMass); if (pbDuplFragment[iVal] == false) @@ -3286,7 +4083,7 @@ void CometSearch::XcorrScore(char *szProteinSeq, dXcorr *= 0.005; // Scale intensities to 50 and divide score by 1E4. - dXcorr= std::round(dXcorr* 1000.0) / 1000.0; // round to 3 decimal points + dXcorr = std::round(dXcorr* 1000.0) / 1000.0; // round to 3 decimal points Threading::LockMutex(pQuery->accessMutex); @@ -3481,6 +4278,7 @@ void CometSearch::XcorrScoreI(char *szProteinSeq, } +/* double CometSearch::GetFragmentIonMass(int iWhichIonSeries, int i, int ctCharge, @@ -3516,6 +4314,7 @@ double CometSearch::GetFragmentIonMass(int iWhichIonSeries, return (dFragmentIonMass + (ctCharge - 1.0) * PROTON_MASS) / ctCharge; } +*/ void CometSearch::StorePeptide(int iWhichQuery, @@ -3627,10 +4426,10 @@ void CometSearch::StorePeptide(int iWhichQuery, pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr = (float)dXcorr; - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb) //FIX { - pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = _proteinInfo.cPrevAA; - pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = _proteinInfo.cNextAA; +// pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = _proteinInfo.cPrevAA; +// pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = _proteinInfo.cNextAA; } else { @@ -6129,7 +6928,7 @@ bool CometSearch::CalcVarModIons(char *szProteinSeq, for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse); pbDuplFragment[BIN(dFragMass)] = false; 
_uiBinnedIonMasses[ctCharge][ctIonSeries][ctLen][0] = 0; @@ -6191,7 +6990,7 @@ bool CometSearch::CalcVarModIons(char *szProteinSeq, // iLenPeptide-1 to complete set of internal fragment ions for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforward, _pdAAreverse); int iVal = BIN(dFragMass); @@ -6433,7 +7232,7 @@ bool CometSearch::CalcVarModIons(char *szProteinSeq, for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); pbDuplFragment[BIN(dFragMass)] = false; _uiBinnedIonMassesDecoy[ctCharge][ctIonSeries][ctLen][0] = 0; @@ -6486,7 +7285,7 @@ bool CometSearch::CalcVarModIons(char *szProteinSeq, // iLenPeptide-1 to complete set of internal fragment ions for (ctLen = 0; ctLen < iLenMinus1; ++ctLen) { - double dFragMass = GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); + double dFragMass = CometMassSpecUtils::GetFragmentIonMass(iWhichIonSeries, ctLen, ctCharge, _pdAAforwardDecoy, _pdAAreverseDecoy); int iVal = BIN(dFragMass); if (pbDuplFragment[iVal] == false) diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 612d1b8b..6558bea5 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -16,9 +16,7 @@ #ifndef _COMETSEARCH_H_ #define _COMETSEARCH_H_ -#include "Common.h" #include "CometDataInternal.h" -#include struct SearchThreadData { @@ -83,6 +81,11 @@ class CometSearch int iStartPos); bool CheckEnzymeEndTermini(char *szProteinSeq, int iEndPos); + int BinarySearchMass(int start, + int end, + double dCalcPepMass); + bool CheckMassMatch(int iWhichQuery, + double 
dCalcPepMass); struct ProteinInfo { @@ -114,9 +117,6 @@ class CometSearch int end, string strMod, vector& vectorPeffOBO); - int BinarySearchMass(int start, - int end, - double dCalcPepMass); static int BinarySearchIndexMass(int iWhichThread, int iPrecursorBin, int start, @@ -166,13 +166,13 @@ class CometSearch unsigned int uiBinnedIonMasses[MAX_FRAGMENT_CHARGE+1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][FRAGINDEX_VMODS+1], unsigned int uiBinnedPrecursorNL[MAX_PRECURSOR_NL_SIZE][MAX_PRECURSOR_CHARGE], int iNumMatchedFragmentIons); - bool CheckMassMatch(int iWhichQuery, - double dCalcPepMass); +/* static double GetFragmentIonMass(int iWhichIonSeries, int i, int ctCharge, double *pdAAforward, double *pdAAreverse); +*/ int CheckDuplicate(int iWhichQuery, int iStartResidue, int iEndResidue, @@ -239,6 +239,11 @@ class CometSearch struct sDBEntry *dbe); static void SearchFragmentIndex(size_t iWhichQuery, ThreadPool *tp); + static bool SearchPeptideIndex(void); + void AnalyzePeptideIndex(int iWhichQuery, + DBIndex sDBI, + bool *pbDuplFragment, + struct sDBEntry *dbe); bool SearchForPeptides(struct sDBEntry dbe, char *szProteinSeq, int iNtermPeptideOnly, // used in clipped methionine sequence diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 2efdbc07..d748dfbe 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -28,7 +28,7 @@ #include "CometSearchManager.h" #include "CometStatus.h" #include "CometFragmentIndex.h" - +#include "CometPeptideIndex.h" #include @@ -54,7 +54,8 @@ bool* g_bIndexPrecursors; // array for BIN(pre vector g_vFragmentPeptides; // each peptide is represented here iWhichPeptide, which mod if any, calculated mass vector g_vRawPeptides; // list of unmodified peptides and their proteins as file pointers bool g_bPlainPeptideIndexRead = false; -FILE* fpfasta; +bool g_bPeptideIndexRead = false; +FILE* fpfasta; // file pointer to FASTA; would be same as fpdb if input db was already FASTA but 
otherwise needed if input is .idx file /****************************************************************************** @@ -474,6 +475,20 @@ static bool ValidateSequenceDatabaseFile() FILE *fpcheck; char szErrorMsg[SIZE_ERROR]; + // open FASTA for retrieving protein names + string sTmpDB = g_staticParams.databaseInfo.szDatabase; + if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) + sTmpDB = sTmpDB.erase(sTmpDB.size() - 4); // need plain fasta if indexdb input + if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error (3) - cannot read FASTA file \"%s\".\n", sTmpDB.c_str()); + string strErrorMsg(szErrorMsg); + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(szErrorMsg); + return false; + } + // if .idx database specified but does not exist, first see if corresponding // fasta exists and if it does, create the .idx file if (strstr(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) @@ -561,6 +576,7 @@ static bool ValidateSequenceDatabaseFile() } fclose(fpcheck); + return true; } @@ -861,6 +877,7 @@ bool CometSearchManager::InitializeStaticParams() GetParamValue("scale_fragmentNL", g_staticParams.options.bScaleFragmentNL); GetParamValue("create_fragment_index", g_staticParams.options.bCreateFragmentIndex); + GetParamValue("create_peptide_index", g_staticParams.options.bCreatePeptideIndex); GetParamValue("max_iterations", g_staticParams.options.lMaxIterations); @@ -1640,6 +1657,7 @@ bool CometSearchManager::InitializeStaticParams() g_staticParams.options.iMaxDuplicateProteins = INT_MAX; g_staticParams.iPrecursorNLSize = (int)g_staticParams.precursorNLIons.size(); + if (g_staticParams.iPrecursorNLSize > MAX_PRECURSOR_NL_SIZE) g_staticParams.iPrecursorNLSize = MAX_PRECURSOR_NL_SIZE; @@ -1650,18 +1668,49 @@ bool CometSearchManager::InitializeStaticParams() 
g_staticParams.options.iFragIndexNumThreads = (g_staticParams.options.iNumThreads > FRAGINDEX_MAX_THREADS ? FRAGINDEX_MAX_THREADS : g_staticParams.options.iNumThreads); // At this point, check extension to set whether index database or not - if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 7, ".pepidx")) - { - g_staticParams.iIndexDb = 2; // peptide index - } - else if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) + if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) { - g_staticParams.iIndexDb = 1; // fragment ion index + // Has .idx extension. Now parse first line ot see if peptide index or fragment index. + // either "Comet peptide index" or "Comet fragment ion index" + char szTmp[512]; + FILE *fp; + + if ( (fp=fopen(g_staticParams.databaseInfo.szDatabase, "r")) == NULL) + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error - cannot open database index file \"%s\".\n", g_staticParams.databaseInfo.szDatabase); + string strErrorMsg(szErrorMsg); + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(szErrorMsg); + return false; + } - // if searching fragment index database, limit load of query spectra as no - // need to load all spectra into memory since querying spectra sequentially - if (g_staticParams.options.iSpectrumBatchSize > FRAGINDEX_MAX_BATCHSIZE || g_staticParams.options.iSpectrumBatchSize == 0) - g_staticParams.options.iSpectrumBatchSize = FRAGINDEX_MAX_BATCHSIZE; + fgets(szTmp, 512, fp); + fclose(fp); + + if (!strncmp(szTmp, "Comet peptide index", 19)) + { + g_staticParams.iIndexDb = 2; // peptide index + } + else if (!strncmp(szTmp, "Comet fragment ion index", 24)) + { + g_staticParams.iIndexDb = 1; // fragment ion index + + // if searching fragment index database, limit load of query spectra as no + // need to load all spectra into memory since 
querying spectra sequentially + if (g_staticParams.options.iSpectrumBatchSize > FRAGINDEX_MAX_BATCHSIZE || g_staticParams.options.iSpectrumBatchSize == 0) + g_staticParams.options.iSpectrumBatchSize = FRAGINDEX_MAX_BATCHSIZE; + } + else + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error - first line of database index file \"%s\" contains:\n", g_staticParams.databaseInfo.szDatabase); + sprintf(szErrorMsg+strlen(szErrorMsg), "%s\n", szTmp); + string strErrorMsg(szErrorMsg); + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(szErrorMsg); + return false; + } } if (g_staticParams.options.bCreateFragmentIndex && g_staticParams.iIndexDb) @@ -1674,7 +1723,7 @@ bool CometSearchManager::InitializeStaticParams() return false; } - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb == 1) { g_bIndexPrecursors = (bool*) malloc(BIN(g_staticParams.options.dPeptideMassHigh)); if (g_bIndexPrecursors == NULL) @@ -2032,6 +2081,12 @@ bool CometSearchManager::DoSearch() fflush(stdout); } + if (g_staticParams.options.bCreatePeptideIndex) + { + bSucceeded = CometPeptideIndex::WritePeptideIndex(tp); + return bSucceeded; + } + if (g_staticParams.options.bCreateFragmentIndex || !g_staticParams.iIndexDb) { // If specified, read in the protein variable mod filter file content. 
@@ -2086,7 +2141,7 @@ bool CometSearchManager::DoSearch() bool bBlankSearchFile = false; - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb == 1) { if (!g_staticParams.options.iFragIndexSkipReadPrecursors) { @@ -2534,7 +2589,7 @@ bool CometSearchManager::DoSearch() FILE *fpdb; // need FASTA file again to grab headers for output (currently just store file positions) string sTmpDB = g_staticParams.databaseInfo.szDatabase; - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb > 0) sTmpDB = sTmpDB.erase(sTmpDB.size()-4); // need plain fasta if indexdb input if ((fpdb=fopen(sTmpDB.c_str(), "r")) == NULL) { @@ -2554,12 +2609,12 @@ bool CometSearchManager::DoSearch() CometFragmentIndex sqSearch; - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb == 1) { if (!g_bPlainPeptideIndexRead) { auto tStartTime = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iIndexDb) + if (!g_staticParams.options.bOutputSqtStream) { cout << " - read .idx ... "; fflush(stdout); @@ -2567,7 +2622,7 @@ bool CometSearchManager::DoSearch() sqSearch.ReadPlainPeptideIndex(); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iIndexDb) + if (!g_staticParams.options.bOutputSqtStream) { cout << CometFragmentIndex::ElapsedTime(tStartTime) << endl; } @@ -2600,7 +2655,6 @@ bool CometSearchManager::DoSearch() char szTimeBuffer[32]; szTimeBuffer[0] = '\0'; #endif - // Load and preprocess all the spectra. if (!g_staticParams.options.bOutputSqtStream && !g_staticParams.iIndexDb) { @@ -2773,7 +2827,6 @@ bool CometSearchManager::DoSearch() fflush(stdout); } #endif - // Sort g_pvQuery vector by scan. 
std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByScanNumber); @@ -2803,20 +2856,19 @@ bool CometSearchManager::DoSearch() && g_staticParams.variableModParameters.varModList[iNtermMod - 1].iWhichTerm == 0) { // only match to peptides at the N-terminus of proteins as protein terminal mod applied - CometMassSpecUtils::GetPrevNextAA(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, 1); + CometMassSpecUtils::GetPrevNextAA(fpfasta, iWhichQuery, iWhichResult, iPrintTargetDecoy, 1); } else if (iCtermMod > 0 && g_staticParams.variableModParameters.varModList[iCtermMod - 1].iVarModTermDistance == 0 && g_staticParams.variableModParameters.varModList[iCtermMod - 1].iWhichTerm == 1) { // only match to peptides at the C-terminus of proteins as protein terminal mod applied - CometMassSpecUtils::GetPrevNextAA(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, 2); + CometMassSpecUtils::GetPrevNextAA(fpfasta, iWhichQuery, iWhichResult, iPrintTargetDecoy, 2); } else { - // peptide can be anywhere in sequence - CometMassSpecUtils::GetPrevNextAA(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, 0); + CometMassSpecUtils::GetPrevNextAA(fpfasta, iWhichQuery, iWhichResult, iPrintTargetDecoy, 0); } } } @@ -2855,7 +2907,9 @@ bool CometSearchManager::DoSearch() } if (g_staticParams.options.bOutputTxtFile) + { CometWriteTxt::WriteTxt(fpout_txt, fpoutd_txt, fpdb); + } // Write SQT last as I destroy the g_staticParams.szMod string during that process if (g_staticParams.options.bOutputSqtStream || g_staticParams.options.bOutputSqtFile) @@ -3055,7 +3109,7 @@ bool CometSearchManager::DoSearch() break; } - if (g_staticParams.iIndexDb) + if (g_staticParams.iIndexDb == 1) // clean fragment ion index { int iNumIndexingThreads = g_staticParams.options.iNumThreads; if (iNumIndexingThreads > FRAGINDEX_MAX_THREADS) @@ -3128,6 +3182,8 @@ bool CometSearchManager::InitializeSingleSpectrumSearch() sqSearch.CreateFragmentIndex(tp); } +/* now done in ValidateSequenceDatabaseFile() */ +/* // open FASTA 
for retrieving protein names string sTmpDB = g_staticParams.databaseInfo.szDatabase; if (!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) @@ -3141,6 +3197,7 @@ bool CometSearchManager::InitializeSingleSpectrumSearch() logerr(szErrorMsg); return false; } +*/ singleSearchInitializationComplete = true; @@ -3192,6 +3249,9 @@ bool CometSearchManager::DoSingleSpectrumSearch(int iPrecursorCharge, if (!InitializeSingleSpectrumSearch()) return false; + // tRealTimeStart used to track elapsed search time and to exit if g_staticParams.options.iMaxIndexRunTime is surpased + g_staticParams.tRealTimeStart = std::chrono::high_resolution_clock::now(); + // We need to reset some of the static variables in-between input files CometPreprocess::Reset(); diff --git a/CometSearch/Common.h b/CometSearch/Common.h index 2792d0f0..be0f7aa8 100644 --- a/CometSearch/Common.h +++ b/CometSearch/Common.h @@ -69,7 +69,7 @@ using namespace std; #define GITHUBSHA "" #endif -#define comet_version "2024.02 rev. 0" +#define comet_version "2024.02 rev. 
1" #define copyright "(c) University of Washington" extern string g_sCometVersion; // version string including git hash diff --git a/CometSearch/ModificationsPermuter.cpp b/CometSearch/ModificationsPermuter.cpp index 76ad9b30..7a652db5 100644 --- a/CometSearch/ModificationsPermuter.cpp +++ b/CometSearch/ModificationsPermuter.cpp @@ -22,10 +22,11 @@ #include #include #include -#include "CombinatoricsUtils.h" +#include "Common.h" +#include "CometDataInternal.h" #include "ModificationsPermuter.h" +#include "CombinatoricsUtils.h" #include "CometFragmentIndex.h" -#include "Common.h" //using namespace std; diff --git a/CometSearch/ModificationsPermuter.h b/CometSearch/ModificationsPermuter.h index 5ff6bf22..114de74b 100644 --- a/CometSearch/ModificationsPermuter.h +++ b/CometSearch/ModificationsPermuter.h @@ -16,9 +16,6 @@ #ifndef _MODIFICATIONSPERMUTER_H_ #define _MODIFICATIONSPERMUTER_H_ -#include "Common.h" -#include "CometDataInternal.h" - class ModificationsPermuter { public: diff --git a/Makefile b/Makefile index 25fbcf3b..9d002a6d 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,8 @@ DEPS = CometSearch/CometData.h CometSearch/CometDataInternal.h CometSearch/Comet CometSearch/CometPostAnalysis.cpp CometSearch/CometSearchManager.cpp CometSearch/CometWritePercolator.cpp CometSearch/Threading.cpp\ CometSearch/CometPreprocess.cpp CometSearch/CometWriteOut.cpp CometSearch/CometWriteSqt.cpp CometSearch/CombinatoricsUtils.cpp\ CometSearch/ModificationsPermuter.cpp CometSearch/CometInterfaces.h CometSearch/CometInterfaces.cpp\ - CometSearch/CometFragmentIndex.cpp CometSearch/CometFragmentIndex.h + CometSearch/CometFragmentIndex.cpp CometSearch/CometFragmentIndex.h\ + CometSearch/CometPeptideIndex.cpp CometSearch/CometPeptideIndex.h LIBPATHS = -L$(MSTOOLKIT) -L$(COMETSEARCH) LIBS = -lcometsearch -lmstoolkitlite -lm -lpthread From d11f9c540667ecbe0ece196afd768acddbcbef42 Mon Sep 17 00:00:00 2001 From: jke000 Date: Wed, 11 Dec 2024 12:02:55 -0800 Subject: [PATCH 03/18] add 
CometPeptideIndex files --- CometSearch/CometPeptideIndex.cpp | 325 ++++++++++++++++++++++++++++++ CometSearch/CometPeptideIndex.h | 53 +++++ 2 files changed, 378 insertions(+) create mode 100644 CometSearch/CometPeptideIndex.cpp create mode 100644 CometSearch/CometPeptideIndex.h diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp new file mode 100644 index 00000000..9118beb1 --- /dev/null +++ b/CometSearch/CometPeptideIndex.cpp @@ -0,0 +1,325 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "CometPeptideIndex.h" + +extern comet_fileoffset_t clSizeCometFileOffset; + + +CometPeptideIndex::CometPeptideIndex() +{ +} + +CometPeptideIndex::~CometPeptideIndex() +{ +} + + +bool CometPeptideIndex::WritePeptideIndex(ThreadPool *tp) +{ + bool bSucceeded; + char szOut[256]; + FILE *fptr; + + const int iIndex_SIZE_FILE=SIZE_FILE+4; + char szIndexFile[iIndex_SIZE_FILE]; + sprintf(szIndexFile, "%s.idx", g_staticParams.databaseInfo.szDatabase); + + if ((fptr = fopen(szIndexFile, "wb")) == NULL) + { + printf(" Error - cannot open index file %s to write\n", szIndexFile); + exit(1); + } + + sprintf(szOut, " Creating peptide index file: "); + logout(szOut); + fflush(stdout); + + bSucceeded = CometSearch::AllocateMemory(g_staticParams.options.iNumThreads); + + g_massRange.dMinMass = g_staticParams.options.dPeptideMassLow; + g_massRange.dMaxMass = g_staticParams.options.dPeptideMassHigh; + + tp->fillPool( g_staticParams.options.iNumThreads < 0 ? 0 : g_staticParams.options.iNumThreads-1); + if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) + g_massRange.bNarrowMassRange = true; + else + g_massRange.bNarrowMassRange = false; + + if (bSucceeded) + { + printf("OK before RunSearch\n"); + bSucceeded = CometSearch::RunSearch(0, 0, tp); + } + + if (!bSucceeded) + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error in RunSearch() for peptide index creation.\n"); + logerr(szErrorMsg); + CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); + return false; + } + + // sanity check + if (g_pvDBIndex.size() == 0) + { + char szErrorMsg[SIZE_ERROR]; + sprintf(szErrorMsg, " Error: no peptides in index; check the input database file or search parameters.\n"); + logerr(szErrorMsg); + CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); + return false; + } + + // remove duplicates + sprintf(szOut, " - removing duplicates\n"); + logout(szOut); + fflush(stdout); + + // keep unique entries only; sort by 
peptide/modification state and protein + // first sort by peptide, then mod state, then protein file position + sort(g_pvDBIndex.begin(), g_pvDBIndex.end(), CometFragmentIndex::CompareByPeptide); + + // At this point, need to create g_pvProteinsList protein file position vector of vectors to map each peptide + // to every protein. g_pvDBIndex.at().lProteinFilePosition is now reference to protein vector entry + vector> g_pvProteinsList; + vector temp; // stores list of duplicate proteins which gets pushed to g_pvProteinsList + + // Create g_pvProteinsList. This is a vector of vectors. Each element is vector list + // of duplicate proteins (generated as "temp") ... these are generated by looping + // through g_pvDBIndex and looking for consecutive, same peptides. Once the "temp" + // vector is assigned the lIndexProteinFilePosition offset, the g_pvDBIndex entry is + // is assigned lProtCount to lIndexProteinFilePosition. This is used later to look up + // the right vector element of duplicate proteins later. 
+ long lProtCount = 0; + for (size_t i = 0; i < g_pvDBIndex.size(); i++) + { + if (i == 0) + { + temp.push_back(g_pvDBIndex.at(i).lIndexProteinFilePosition); + g_pvDBIndex.at(i).lIndexProteinFilePosition = lProtCount; + } + else + { + // each unique peptide, irregardless of mod state, will have the same list + // of matched proteins + if (!strcmp(g_pvDBIndex.at(i).szPeptide, g_pvDBIndex.at(i-1).szPeptide)) + { + temp.push_back(g_pvDBIndex.at(i).lIndexProteinFilePosition); + g_pvDBIndex.at(i).lIndexProteinFilePosition = lProtCount; + } + else + { + // different peptide + mod state so go ahead and push temp onto g_pvProteinsList + // and store current protein reference into new temp + // temp can have duplicates due to mod forms of peptide so make unique here + sort(temp.begin(), temp.end()); + temp.erase(unique(temp.begin(), temp.end()), temp.end() ); + g_pvProteinsList.push_back(temp); + + lProtCount++; // start new row in g_pvProteinsList + temp.clear(); + temp.push_back(g_pvDBIndex.at(i).lIndexProteinFilePosition); + g_pvDBIndex.at(i).lIndexProteinFilePosition = lProtCount; + } + } + } + // now at end of loop, push last temp onto g_pvProteinsList + sort(temp.begin(), temp.end()); + temp.erase(unique(temp.begin(), temp.end()), temp.end() ); + g_pvProteinsList.push_back(temp); + + g_pvDBIndex.erase(unique(g_pvDBIndex.begin(), g_pvDBIndex.end()), g_pvDBIndex.end()); + + // sort by mass; + sort(g_pvDBIndex.begin(), g_pvDBIndex.end(), CometFragmentIndex::CompareByMass); + +/* + for (std::vector::iterator it = g_pvDBIndex.begin(); it != g_pvDBIndex.end(); ++it) + { + printf("OK after unique "); + if ((*it).pcVarModSites[strlen((*it).szPeptide)] != 0) + printf("n*"); + for (unsigned int x = 0; x < strlen((*it).szPeptide); x++) + { + printf("%c", (*it).szPeptide[x]); + if ((*it).pcVarModSites[x] != 0) + printf("*"); + } + if ((*it).pcVarModSites[strlen((*it).szPeptide) + 1] != 0) + printf("c*"); + printf(" %f %lld\n", (*it).dPepMass, (*it).lIndexProteinFilePosition); + } 
+ printf("\n"); +*/ + + sprintf(szOut, " - writing file\n"); + logout(szOut); + fflush(stdout); + + // write out index header + fprintf(fptr, "Comet peptide index database. Comet version %s\n", g_sCometVersion.c_str()); + fprintf(fptr, "InputDB: %s\n", g_staticParams.databaseInfo.szDatabase); + fprintf(fptr, "MassRange: %lf %lf\n", g_staticParams.options.dPeptideMassLow, g_staticParams.options.dPeptideMassHigh); + fprintf(fptr, "MassType: %d %d\n", g_staticParams.massUtility.bMonoMassesParent, g_staticParams.massUtility.bMonoMassesFragment); + fprintf(fptr, "Enzyme: %s [%d %s %s]\n", g_staticParams.enzymeInformation.szSearchEnzymeName, + g_staticParams.enzymeInformation.iSearchEnzymeOffSet, + g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, + g_staticParams.enzymeInformation.szSearchEnzymeNoBreakAA); + fprintf(fptr, "Enzyme2: %s [%d %s %s]\n", g_staticParams.enzymeInformation.szSearchEnzyme2Name, + g_staticParams.enzymeInformation.iSearchEnzyme2OffSet, + g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, + g_staticParams.enzymeInformation.szSearchEnzyme2NoBreakAA); + fprintf(fptr, "NumPeptides: %ld\n", (long)g_pvDBIndex.size()); + + // write out static mod params A to Z is ascii 65 to 90 then terminal mods + fprintf(fptr, "StaticMod:"); + for (int x = 65; x <= 90; x++) + fprintf(fptr, " %lf", g_staticParams.staticModifications.pdStaticMods[x]); + fprintf(fptr, " %lf", g_staticParams.staticModifications.dAddNterminusPeptide); + fprintf(fptr, " %lf", g_staticParams.staticModifications.dAddCterminusPeptide); + fprintf(fptr, " %lf", g_staticParams.staticModifications.dAddNterminusProtein); + fprintf(fptr, " %lf\n", g_staticParams.staticModifications.dAddCterminusProtein); + + // write out variable mod params + fprintf(fptr, "VariableMod:"); + for (int x = 0; x < VMODS; x++) + { + fprintf(fptr, " %s %lf:%lf", g_staticParams.variableModParameters.varModList[x].szVarModChar, + g_staticParams.variableModParameters.varModList[x].dVarModMass, + 
g_staticParams.variableModParameters.varModList[x].dNeutralLoss); + } + fprintf(fptr, "\n\n"); + + // Now write out: vector> g_pvProteinsList + comet_fileoffset_t clProteinsFilePos = comet_ftell(fptr); + size_t tTmp = g_pvProteinsList.size(); + fwrite(&tTmp, clSizeCometFileOffset, 1, fptr); + for (auto it = g_pvProteinsList.begin(); it != g_pvProteinsList.end(); ++it) + { + tTmp = (*it).size(); + fwrite(&tTmp, sizeof(size_t), 1, fptr); + for (size_t it2 = 0; it2 < tTmp; ++it2) + { + fwrite(&((*it).at(it2)), clSizeCometFileOffset, 1, fptr); + } + } + + // next write out the peptides and track peptide mass index + int iMaxPeptideMass = (int)(g_staticParams.options.dPeptideMassHigh); + int iMaxPeptideMass10 = iMaxPeptideMass * 10; // make mass index at resolution of 0.1 Da + comet_fileoffset_t *lIndex = new comet_fileoffset_t[iMaxPeptideMass10 + 1]; + for (int x = 0; x <= iMaxPeptideMass10; x++) + lIndex[x] = -1; + + // write out peptide entry here + int iPrevMass10 = 0; + for (std::vector::iterator it = g_pvDBIndex.begin(); it != g_pvDBIndex.end(); ++it) + { + if ((int)((*it).dPepMass * 10.0) > iPrevMass10) + { + iPrevMass10 = (int)((*it).dPepMass * 10.0); + if (iPrevMass10 < iMaxPeptideMass10) + lIndex[iPrevMass10] = comet_ftell(fptr); + } + + int iLen = (int)strlen((*it).szPeptide); + fwrite(&iLen, sizeof(int), 1, fptr); + fwrite((*it).szPeptide, sizeof(char), iLen, fptr); +// fwrite((*it).szPrevNextAA, sizeof(char), 2, fptr); + + // write out for char 0=no mod, N=mod. 
If N, write out var mods as N pairs (pos,whichmod) + int iLen2 = iLen + 2; + unsigned char cNumMods = 0; + for (unsigned char x=0; x 0) + { + for (unsigned char x=0; xszPeptide, sizeof(char), iLen, fp); + sDBI->szPeptide[iLen] = '\0'; + + unsigned char cNumMods; // number of var mods encoded as position:residue pairs + tTmp = fread(&cNumMods, sizeof(unsigned char), 1, fp); // read how many var mods are stored + + memset(sDBI->pcVarModSites, 0, sizeof(unsigned char)*iLen+2); + if (cNumMods > 0) + { + for (unsigned char x=0; xpcVarModSites[(int)cPosition] = cResidue; + } + } + // done reading mod sites + + tTmp = fread(&(sDBI->dPepMass), sizeof(double), 1, fp); + tTmp = fread(&(sDBI->lIndexProteinFilePosition), sizeof(comet_fileoffset_t), 1, fp); +} diff --git a/CometSearch/CometPeptideIndex.h b/CometSearch/CometPeptideIndex.h new file mode 100644 index 00000000..a9630b9e --- /dev/null +++ b/CometSearch/CometPeptideIndex.h @@ -0,0 +1,53 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#ifndef _COMETPEPTIDEINDEX_H_ +#define _COMETPEPTIDEINDEX_H_ + +#include "Common.h" +//#include "CometDataInternal.h" +#include "CometPostAnalysis.h" +#include "CometSearch.h" +#include "CometStatus.h" +#include "CometMassSpecUtils.h" +#include "CometFragmentIndex.h" +#include "ThreadPool.h" + + + +class CometPeptideIndex +{ +public: + CometPeptideIndex(); + ~CometPeptideIndex(); + + static bool WritePeptideIndex(ThreadPool *tp); + static void ReadPeptideIndexEntry(struct DBIndex *sDBI, FILE *fp); + +private: + +/* + unsigned int _uiBinnedIonMasses[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; + unsigned int _uiBinnedIonMassesDecoy[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; + unsigned int _uiBinnedPrecursorNL[MAX_PRECURSOR_NL_SIZE][MAX_PRECURSOR_CHARGE]; + unsigned int _uiBinnedPrecursorNLDecoy[MAX_PRECURSOR_NL_SIZE][MAX_PRECURSOR_CHARGE]; +*/ + static bool *_pbSearchMemoryPool; // Pool of memory to be shared by search threads + static bool **_ppbDuplFragmentArr; // Number of arrays equals number of threads + + static Mutex _vFragmentPeptidesMutex; +}; + +#endif // _COMETPEPTIDEINDEX_H_ From 09dbe8bb02ad8042801dd16154e9e40325b385ea Mon Sep 17 00:00:00 2001 From: jke000 Date: Wed, 11 Dec 2024 16:46:16 -0800 Subject: [PATCH 04/18] use proper mutex to protect g_pvDBIndex and fix capturing proper protein file position for peptide index search --- CometSearch/CometPeptideIndex.cpp | 1 - CometSearch/CometSearch.cpp | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp index 9118beb1..3038bdcb 100644 --- a/CometSearch/CometPeptideIndex.cpp +++ b/CometSearch/CometPeptideIndex.cpp @@ -60,7 +60,6 @@ bool CometPeptideIndex::WritePeptideIndex(ThreadPool *tp) if (bSucceeded) { - printf("OK before RunSearch\n"); bSucceeded = CometSearch::RunSearch(0, 0, tp); } diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp 
index efb5118c..d0bef469 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -1785,6 +1785,10 @@ bool CometSearch::SearchPeptideIndex(void) comet_fseek(fp, lReadIndex[iStart10], SEEK_SET); CometPeptideIndex::ReadPeptideIndexEntry(&sDBI, fp); + // only use of dbe here is to store the protein position; used for backwards + // compatibility with standard search in StorePeptide + dbe.lProteinFilePosition = sDBI.lIndexProteinFilePosition; + while ((int)(sDBI.dPepMass * 10) <= iEnd10) { /* @@ -1818,6 +1822,7 @@ bool CometSearch::SearchPeptideIndex(void) break; CometPeptideIndex::ReadPeptideIndexEntry(&sDBI, fp); + dbe.lProteinFilePosition = sDBI.lIndexProteinFilePosition; // read past last entry in indexed db, need to break out of loop if (feof(fp)) @@ -6732,7 +6737,7 @@ bool CometSearch::MergeVarMods(char *szProteinSeq, { if (g_staticParams.options.bCreatePeptideIndex) { - Threading::LockMutex(g_pvQueryMutex); + Threading::LockMutex(g_pvDBIndexMutex); // add to DBIndex vector DBIndex sDBTmp; @@ -6749,7 +6754,7 @@ bool CometSearch::MergeVarMods(char *szProteinSeq, g_pvDBIndex.push_back(sDBTmp); - Threading::UnlockMutex(g_pvQueryMutex); + Threading::UnlockMutex(g_pvDBIndexMutex); } else { From 7f67b5b14b7efcd021f00260bd7290a81d9b4f19 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Wed, 11 Dec 2024 22:38:54 -0800 Subject: [PATCH 05/18] update VS project with CometPeptideIndex.cpp and .h files --- CometSearch/CometSearch.vcxproj | 2 ++ CometSearch/CometSearch.vcxproj.filters | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/CometSearch/CometSearch.vcxproj b/CometSearch/CometSearch.vcxproj index 7577b2fb..e6c8c3f1 100644 --- a/CometSearch/CometSearch.vcxproj +++ b/CometSearch/CometSearch.vcxproj @@ -177,6 +177,7 @@ + @@ -199,6 +200,7 @@ + diff --git a/CometSearch/CometSearch.vcxproj.filters b/CometSearch/CometSearch.vcxproj.filters index 834ed90c..d2d17e36 100644 --- a/CometSearch/CometSearch.vcxproj.filters +++ 
b/CometSearch/CometSearch.vcxproj.filters @@ -84,6 +84,9 @@ Header Files + + Header Files + @@ -134,5 +137,8 @@ Source Files + + Source Files + \ No newline at end of file From 46306306d4a46f4ee6c54734210c3ad027cb8134 Mon Sep 17 00:00:00 2001 From: jke000 Date: Thu, 12 Dec 2024 11:25:13 -0800 Subject: [PATCH 06/18] remove -fconcepts from Makefile in hopes that macOS runner builds --- CometSearch/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index f611e93d..71f529e2 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,9 +14,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else - override CXXFLAGS += -O3 -static -fconcepts -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 endif From 607e370f1d833daee30145a4d7c786c406f2f7c0 Mon Sep 17 00:00:00 2001 From: jke000 Date: Thu, 12 Dec 2024 13:45:01 -0800 Subject: [PATCH 07/18] Makefile change to see if macOS compile works; complaint about no template named 'function' in namespace 'std' even with <functional> included in Threadpool.h --- CometSearch/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 71f529e2..9f5a59a9 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,7 +14,7 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -std=c++11 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 From 79600929023fbac31d3499e42299d4bdbbb6a0d8 Mon Sep 17 00:00:00 2001 From: jke000 Date: Thu, 12 Dec 2024 13:50:00 -0800 Subject: [PATCH 08/18] another test with functional --- Comet.cpp | 1 + CometSearch/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Comet.cpp b/Comet.cpp index acb66055..a5776fd0 100644 --- a/Comet.cpp +++ b/Comet.cpp @@ -19,6 +19,7 @@ #include "CometInterfaces.h" #include +#include using namespace CometInterfaces; diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 9f5a59a9..71f529e2 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,7 +14,7 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static -std=c++11 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 From 389e74b12f517ff6c782432b046e79f01a97265f Mon Sep 17 00:00:00 2001 From: jke000 Date: Thu, 12 Dec 2024 13:55:21 -0800 Subject: [PATCH 09/18] yet another functional try --- Comet.cpp | 1 - CometSearch/Common.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Comet.cpp b/Comet.cpp index a5776fd0..acb66055 100644 --- a/Comet.cpp +++ b/Comet.cpp @@ -19,7 +19,6 @@ #include "CometInterfaces.h" #include -#include using namespace CometInterfaces; diff --git a/CometSearch/Common.h b/CometSearch/Common.h index be0f7aa8..41a6cf40 100644 --- a/CometSearch/Common.h +++ b/CometSearch/Common.h @@ -64,6 +64,7 @@ using namespace std; #include #include #include +#include #ifndef GITHUBSHA // value passed thru at compile time #define GITHUBSHA "" From 339145c31238af1f8370cfac3b1c8a81bd6209bb Mon Sep 17 00:00:00 2001 From: jke000 Date: Thu, 12 Dec 2024 14:26:24 -0800 Subject: [PATCH 10/18] another test --- CometSearch/Makefile | 2 +- CometSearch/ThreadPool.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 71f529e2..ef17a1ab 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,7 +14,7 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static --std=c++11 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 diff --git a/CometSearch/ThreadPool.h b/CometSearch/ThreadPool.h index 1714c460..e97a3293 100644 --- a/CometSearch/ThreadPool.h +++ b/CometSearch/ThreadPool.h @@ -24,7 +24,6 @@ #include #include #include - #include #ifdef TPP_WIN32THREADS From ef37653f39a1a2391c4858f04cbb7602e1a745b4 Mon Sep 17 00:00:00 2001 From: jke000 Date: Fri, 13 Dec 2024 11:03:29 -0800 Subject: [PATCH 11/18] another test --- CometSearch/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index ef17a1ab..4d859c8e 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,7 +14,7 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static --std=c++11 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static --std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 From 4667e866e371f2cc7a4a109612cdd7f33cead5c0 Mon Sep 17 00:00:00 2001 From: jke000 Date: Fri, 13 Dec 2024 11:09:07 -0800 Subject: [PATCH 12/18] another test --- CometSearch/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 4d859c8e..408b458c 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -16,7 +16,7 @@ UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) override CXXFLAGS += -O3 -static --std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else - override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static --std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 endif From 207927307933b0aeac69cd055ad22a14886055bc Mon Sep 17 00:00:00 2001 From: jke000 Date: Fri, 13 Dec 2024 11:11:13 -0800 Subject: [PATCH 13/18] another test --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9d002a6d..912c9140 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,9 @@ COMETSEARCH = CometSearch UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) + override CXXFLAGS += -O3 -std=c++14 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) else - override CXXFLAGS += -O3 -static -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) + override CXXFLAGS += -O3 -static -std=c++14 -fpermissive -Wall -Wextra -Wno-char-subscripts -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/src/expat-2.2.9/lib -I$(MSTOOLKIT)/src/zlib-1.2.11 -I$(COMETSEARCH) endif EXECNAME = comet.exe From 7472e247ca38955bf1db8c28df885cad6feb891b Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Wed, 18 Dec 2024 09:56:25 -0800 Subject: [PATCH 14/18] Add CreatePeptideIndex and CreateFragmentIndex to CometWrapper. 
--- CometSearch/CometInterfaces.h | 20 ++++++++++-------- CometSearch/CometPostAnalysis.cpp | 17 +++++++++------ CometSearch/CometSearch.cpp | 9 +++----- CometSearch/CometSearch.h | 2 +- CometSearch/CometSearchManager.cpp | 34 +++++++++++++++--------------- CometSearch/CometSearchManager.h | 2 ++ CometWrapper/CometWrapper.cpp | 13 ++++++++++-- CometWrapper/CometWrapper.h | 3 ++- 8 files changed, 57 insertions(+), 43 deletions(-) diff --git a/CometSearch/CometInterfaces.h b/CometSearch/CometInterfaces.h index c3d53a4d..473d63af 100644 --- a/CometSearch/CometInterfaces.h +++ b/CometSearch/CometInterfaces.h @@ -29,6 +29,8 @@ namespace CometInterfaces { public: virtual ~ICometSearchManager() {} + virtual bool CreateFragmentIndex() = 0; + virtual bool CreatePeptideIndex() = 0; virtual bool DoSearch() = 0; virtual bool InitializeSingleSpectrumSearch() = 0; virtual void FinalizeSingleSpectrumSearch() = 0; @@ -42,15 +44,15 @@ namespace CometInterfaces vector & matchedFragments, Scores & scores) = 0; virtual bool DoSingleSpectrumSearchMultiResults(const int topN, - const int iPrecursorCharge, - const double dMZ, - double* dMass, - double* dInten, - const int iNumPeaks, - vector& strReturnPeptide, - vector& strReturnProtein, - vector>& matchedFragments, - vector& scores) = 0; + const int iPrecursorCharge, + const double dMZ, + double* dMass, + double* dInten, + const int iNumPeaks, + vector& strReturnPeptide, + vector& strReturnProtein, + vector>& matchedFragments, + vector& scores) = 0; virtual void AddInputFiles(vector &pvInputFiles) = 0; virtual void SetOutputFileBaseName(const char *pszBaseName) = 0; virtual void SetParam(const string &name, const string &strValue, const string &value) = 0; diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index 65ada640..189b7e21 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -41,15 +41,18 @@ bool CometPostAnalysis::PostAnalysis(ThreadPool* tp) for (int i=0; 
i<(int)g_pvQuery.size(); ++i) { - PostAnalysisThreadData *pThreadData = new PostAnalysisThreadData(i); + if (g_pvQuery.at(i)->iMatchPeptideCount > 0 || g_pvQuery.at(i)->iDecoyMatchPeptideCount > 0) + { + PostAnalysisThreadData* pThreadData = new PostAnalysisThreadData(i); - pPostAnalysisThreadPool->doJob(std::bind(PostAnalysisThreadProc, pThreadData, pPostAnalysisThreadPool)); + pPostAnalysisThreadPool->doJob(std::bind(PostAnalysisThreadProc, pThreadData, pPostAnalysisThreadPool)); - pThreadData = NULL; - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - { - break; + pThreadData = NULL; + bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); + if (!bSucceeded) + { + break; + } } } diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index d0bef469..bf886dad 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -1808,15 +1808,14 @@ bool CometSearch::SearchPeptideIndex(void) if (sDBI.dPepMass > g_massRange.dMaxMass) break; - CometSearch cs; - int iWhichQuery = cs.BinarySearchMass(0, (int)g_pvQuery.size(), sDBI.dPepMass); + int iWhichQuery = BinarySearchMass(0, (int)g_pvQuery.size(), sDBI.dPepMass); while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) iWhichQuery--; // Do the search if (iWhichQuery != -1) - cs.AnalyzePeptideIndex(iWhichQuery, sDBI, _ppbDuplFragmentArr[0], &dbe); + AnalyzePeptideIndex(iWhichQuery, sDBI, _ppbDuplFragmentArr[0], &dbe); if (comet_ftell(fp)>=lEndOfStruct || sDBI.dPepMass>g_massRange.dMaxMass) break; @@ -1918,8 +1917,6 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, double _pdAAforwardDecoy[MAX_PEPTIDE_LEN]; // Stores fragment ion fragment ladder calc.; sum AA masses including mods double _pdAAreverseDecoy[MAX_PEPTIDE_LEN]; // Stores n-term fragment ion fragment ladder calc.; sum AA masses including mods - CometSearch cs; - if 
(g_staticParams.variableModParameters.bUseFragmentNeutralLoss) { int i; @@ -1965,7 +1962,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, } // Mass tolerance check for particular query against this candidate peptide mass. - if (cs.CheckMassMatch(iWhichQuery, sDBI.dPepMass)) + if (CheckMassMatch(iWhichQuery, sDBI.dPepMass)) { char szDecoyPeptide[MAX_PEPTIDE_LEN]; int piVarModSites[MAX_PEPTIDE_LEN_P2]; // forward mods, generated from sDBI.sVarModSites diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 6558bea5..42d7160d 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -239,7 +239,7 @@ class CometSearch struct sDBEntry *dbe); static void SearchFragmentIndex(size_t iWhichQuery, ThreadPool *tp); - static bool SearchPeptideIndex(void); + bool SearchPeptideIndex(void); void AnalyzePeptideIndex(int iWhichQuery, DBIndex sDBI, bool *pbDuplFragment, diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index d748dfbe..7d296d44 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -2037,6 +2037,23 @@ void CometSearchManager::ResetSearchStatus() g_cometStatus.ResetStatus(); } +bool CometSearchManager::CreateFragmentIndex() +{ + // Override the Create Index flag to force it to create + g_staticParams.options.bCreateFragmentIndex = 1; + + // The DoSearch will create the index and exit + return DoSearch(); +} + +bool CometSearchManager::CreatePeptideIndex() +{ + // Override the Create Index flag to force it to create + g_staticParams.options.bCreatePeptideIndex = 1; + + // The DoSearch will create the index and exit + return DoSearch(); +} bool CometSearchManager::DoSearch() { @@ -3182,23 +3199,6 @@ bool CometSearchManager::InitializeSingleSpectrumSearch() sqSearch.CreateFragmentIndex(tp); } -/* now done in ValidateSequenceDatabaseFile() */ -/* - // open FASTA for retrieving protein names - string sTmpDB = g_staticParams.databaseInfo.szDatabase; - if 
(!strcmp(g_staticParams.databaseInfo.szDatabase + strlen(g_staticParams.databaseInfo.szDatabase) - 4, ".idx")) - sTmpDB = sTmpDB.erase(sTmpDB.size() - 4); // need plain fasta if indexdb input - if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) - { - char szErrorMsg[SIZE_ERROR]; - sprintf(szErrorMsg, " Error (3) - cannot read database file \"%s\".\n", sTmpDB.c_str()); - string strErrorMsg(szErrorMsg); - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(szErrorMsg); - return false; - } -*/ - singleSearchInitializationComplete = true; return true; diff --git a/CometSearch/CometSearchManager.h b/CometSearch/CometSearchManager.h index f5466824..75f6615c 100644 --- a/CometSearch/CometSearchManager.h +++ b/CometSearch/CometSearchManager.h @@ -50,6 +50,8 @@ class CometSearchManager : public ICometSearchManager std::map& GetParamsMap(); // Methods inherited from ICometSearchManager + virtual bool CreateFragmentIndex(); + virtual bool CreatePeptideIndex(); virtual bool DoSearch(); virtual bool InitializeSingleSpectrumSearch(); virtual void FinalizeSingleSpectrumSearch(); diff --git a/CometWrapper/CometWrapper.cpp b/CometWrapper/CometWrapper.cpp index 30b2e9db..e2921114 100644 --- a/CometWrapper/CometWrapper.cpp +++ b/CometWrapper/CometWrapper.cpp @@ -52,13 +52,22 @@ CometSearchManagerWrapper::~CometSearchManagerWrapper() } } -bool CometSearchManagerWrapper::CreateIndex() +bool CometSearchManagerWrapper::CreatePeptideIndex() { if (!_pSearchMgr) { return false; } - return _pSearchMgr->CreateIndex(); + return _pSearchMgr->CreatePeptideIndex(); +} + +bool CometSearchManagerWrapper::CreateFragmentIndex() +{ + if (!_pSearchMgr) + { + return false; + } + return _pSearchMgr->CreateFragmentIndex(); } bool CometSearchManagerWrapper::InitializeSingleSpectrumSearch() diff --git a/CometWrapper/CometWrapper.h b/CometWrapper/CometWrapper.h index 1f0fa191..82200ef8 100644 --- a/CometWrapper/CometWrapper.h +++ b/CometWrapper/CometWrapper.h @@ -35,7 +35,8 @@ namespace 
CometWrapper { CometSearchManagerWrapper(); virtual ~CometSearchManagerWrapper(); - bool CreateIndex(); + bool CreateFragmentIndex(); + bool CreatePeptideIndex(); bool DoSearch(); bool InitializeSingleSpectrumSearch(); void FinalizeSingleSpectrumSearch(); From a113c41dd382df8bd169ce8e0942188c34e5d985 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Mon, 23 Dec 2024 15:18:27 -0800 Subject: [PATCH 15/18] Get peptide index working for DoSingleSpectrumSearchMultiResults call thru CometWrapper. --- CometSearch/CometFragmentIndex.h | 4 +-- CometSearch/CometPeptideIndex.cpp | 9 ++++--- CometSearch/CometPreprocess.cpp | 9 +++---- CometSearch/CometSearch.cpp | 32 ++++++++++++++++-------- CometSearch/CometSearchManager.cpp | 40 +++++++++++++++++++++++++++++- RealtimeSearch/Search.cs | 2 ++ 6 files changed, 74 insertions(+), 22 deletions(-) diff --git a/CometSearch/CometFragmentIndex.h b/CometSearch/CometFragmentIndex.h index c457762f..810047c1 100644 --- a/CometSearch/CometFragmentIndex.h +++ b/CometSearch/CometFragmentIndex.h @@ -55,12 +55,12 @@ class CometFragmentIndex unsigned int y); static void SortFragmentThreadProc(int iWhichThread, ThreadPool* tp); - +/* unsigned int _uiBinnedIonMasses[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; unsigned int _uiBinnedIonMassesDecoy[MAX_FRAGMENT_CHARGE + 1][NUM_ION_SERIES][MAX_PEPTIDE_LEN][VMODS + 1]; unsigned int _uiBinnedPrecursorNL[MAX_PRECURSOR_NL_SIZE][MAX_PRECURSOR_CHARGE]; unsigned int _uiBinnedPrecursorNLDecoy[MAX_PRECURSOR_NL_SIZE][MAX_PRECURSOR_CHARGE]; - +*/ static bool *_pbSearchMemoryPool; // Pool of memory to be shared by search threads static bool **_ppbDuplFragmentArr; // Number of arrays equals number of threads diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp index 3038bdcb..0057416d 100644 --- a/CometSearch/CometPeptideIndex.cpp +++ b/CometSearch/CometPeptideIndex.cpp @@ -172,14 +172,15 @@ bool CometPeptideIndex::WritePeptideIndex(ThreadPool *tp) 
fprintf(fptr, "Comet peptide index database. Comet version %s\n", g_sCometVersion.c_str()); fprintf(fptr, "InputDB: %s\n", g_staticParams.databaseInfo.szDatabase); fprintf(fptr, "MassRange: %lf %lf\n", g_staticParams.options.dPeptideMassLow, g_staticParams.options.dPeptideMassHigh); + fprintf(fptr, "LengthRange: %d %d\n", g_staticParams.options.peptideLengthRange.iStart, g_staticParams.options.peptideLengthRange.iEnd); fprintf(fptr, "MassType: %d %d\n", g_staticParams.massUtility.bMonoMassesParent, g_staticParams.massUtility.bMonoMassesFragment); fprintf(fptr, "Enzyme: %s [%d %s %s]\n", g_staticParams.enzymeInformation.szSearchEnzymeName, - g_staticParams.enzymeInformation.iSearchEnzymeOffSet, - g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, + g_staticParams.enzymeInformation.iSearchEnzymeOffSet, + g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, g_staticParams.enzymeInformation.szSearchEnzymeNoBreakAA); fprintf(fptr, "Enzyme2: %s [%d %s %s]\n", g_staticParams.enzymeInformation.szSearchEnzyme2Name, - g_staticParams.enzymeInformation.iSearchEnzyme2OffSet, - g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, + g_staticParams.enzymeInformation.iSearchEnzyme2OffSet, + g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, g_staticParams.enzymeInformation.szSearchEnzyme2NoBreakAA); fprintf(fptr, "NumPeptides: %ld\n", (long)g_pvDBIndex.size()); diff --git a/CometSearch/CometPreprocess.cpp b/CometSearch/CometPreprocess.cpp index fd3c7091..79ab7011 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -1960,10 +1960,6 @@ bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, pScoring->_spectrumInfoInternal.iChargeState = iPrecursorCharge; - g_massRange.dMinMass = pScoring->_pepMassInfo.dExpPepMass; - g_massRange.dMaxMass = pScoring->_pepMassInfo.dExpPepMass; - g_massRange.iMaxFragmentCharge = pScoring->_spectrumInfoInternal.iMaxFragCharge; - if (iPrecursorCharge == 1) 
pScoring->_spectrumInfoInternal.iMaxFragCharge = 1; else @@ -1974,6 +1970,10 @@ bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, pScoring->_spectrumInfoInternal.iMaxFragCharge = g_staticParams.options.iMaxFragmentCharge; } + g_massRange.dMinMass = pScoring->_pepMassInfo.dExpPepMass; + g_massRange.dMaxMass = pScoring->_pepMassInfo.dExpPepMass; + g_massRange.iMaxFragmentCharge = pScoring->_spectrumInfoInternal.iMaxFragCharge; + //preprocess here int i; int x; @@ -2012,7 +2012,6 @@ bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, pScoring->_spectrumInfoInternal.iArraySize = (int)((pScoring->_pepMassInfo.dExpPepMass + dCushion + 2.0) * g_staticParams.dInverseBinWidth); - // initialize these temporary arrays before re-using size_t iTmp= (size_t)(pScoring->_spectrumInfoInternal.iArraySize)*sizeof(double); diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index bf886dad..81a1c38e 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -1602,8 +1602,7 @@ bool CometSearch::SearchPeptideIndex(void) { sscanf(szBuf, "%d %d", &g_staticParams.massUtility.bMonoMassesParent, &g_staticParams.massUtility.bMonoMassesFragment); } - - if (!strncmp(szBuf, "StaticMod:", 10)) + else if (!strncmp(szBuf, "StaticMod:", 10)) { char *tok; char delims[] = " "; @@ -1644,8 +1643,21 @@ bool CometSearch::SearchPeptideIndex(void) + g_staticParams.staticModifications.dAddCterminusPeptide + g_staticParams.staticModifications.dAddNterminusPeptide; } - - if (!strncmp(szBuf, "VariableMod:", 12)) + else if (!strncmp(szBuf, "Enzyme:", 7)) + { + sscanf(szBuf, "Enzyme: %s [%d %s %s]", g_staticParams.enzymeInformation.szSearchEnzymeName, + &(g_staticParams.enzymeInformation.iSearchEnzymeOffSet), + g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, + g_staticParams.enzymeInformation.szSearchEnzymeNoBreakAA); + } + else if (!strncmp(szBuf, "Enzyme2:", 8)) + { + sscanf(szBuf, "Enzyme2: %s [%d %s %s]", 
g_staticParams.enzymeInformation.szSearchEnzyme2Name, + &(g_staticParams.enzymeInformation.iSearchEnzyme2OffSet), + g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, + g_staticParams.enzymeInformation.szSearchEnzyme2NoBreakAA); + } + else if (!strncmp(szBuf, "VariableMod:", 12)) { char *tok; char delims[] = " "; @@ -1817,7 +1829,7 @@ bool CometSearch::SearchPeptideIndex(void) if (iWhichQuery != -1) AnalyzePeptideIndex(iWhichQuery, sDBI, _ppbDuplFragmentArr[0], &dbe); - if (comet_ftell(fp)>=lEndOfStruct || sDBI.dPepMass>g_massRange.dMaxMass) + if (comet_ftell(fp) >= lEndOfStruct || sDBI.dPepMass > g_massRange.dMaxMass) break; CometPeptideIndex::ReadPeptideIndexEntry(&sDBI, fp); @@ -1837,8 +1849,7 @@ bool CometSearch::SearchPeptideIndex(void) } } -// vector::iterator it = g_pvQuery.begin(); - for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) // g_pvQuery is always size 1 here; for loop is useless + for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) { int iNumMatchedPeptides; @@ -4135,7 +4146,7 @@ void CometSearch::XcorrScore(char *szProteinSeq, dCalcPepMass, dXcorr, bDecoyPep, piVarModSites, dbe); } else if (!CheckDuplicate(iWhichQuery, iStartResidue, iEndResidue, iStartPos, iEndPos, iFoundVariableMod, dCalcPepMass, - szProteinSeq, bDecoyPep, piVarModSites, dbe)) + szProteinSeq, bDecoyPep, piVarModSites, dbe)) { StorePeptide(iWhichQuery, iStartResidue, iStartPos, iEndPos, iFoundVariableMod, szProteinSeq, dCalcPepMass, dXcorr, bDecoyPep, piVarModSites, dbe); @@ -4550,7 +4561,8 @@ void CometSearch::StorePeptide(int iWhichQuery, { siLowestXcorrScoreIndex = siA; } - else if (pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr == pQuery->_pResults[siA].fXcorr) + else if (pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr == pQuery->_pResults[siA].fXcorr + && pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr > XCORR_CUTOFF) { // if current lowest score is the same as current siA peptide, // determine if need to point to 
siA peptide as the one to replace @@ -4613,7 +4625,7 @@ void CometSearch::StorePeptide(int iWhichQuery, memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide, szProteinSeq+iStartPos, iLenPeptide*sizeof(char)); pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide[iLenPeptide]='\0'; pQuery->_pResults[siLowestXcorrScoreIndex].dPepMass = dCalcPepMass; - + if (pQuery->_spectrumInfoInternal.iChargeState > 2) { pQuery->_pResults[siLowestXcorrScoreIndex].iTotalIons = (iLenPeptide - 1) diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 7d296d44..792e7630 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -2078,6 +2078,18 @@ bool CometSearchManager::DoSearch() bool bSucceeded = true; + #ifdef PERF_DEBUG + // print set search parameters + std::map mapParams = GetParamsMap(); + for (std::map::iterator it = mapParams.begin(); it != mapParams.end(); ++it) + { + if (it->first != "[COMET_ENZYME_INFO]") + { + printf("OK parameter name=\"%s\" value=\"%s\"\n", it->first.c_str(), it->second->GetStringValue().c_str()); + } + } +#endif + // add git hash to version string if present // repeated here from Comet main() as main() is skipped when search invoked via DLL if (strlen(GITHUBSHA) > 0) @@ -3152,6 +3164,10 @@ bool CometSearchManager::DoSearch() printf(" - done.\n\n"); } + else if (g_staticParams.iIndexDb == 2) + { + printf(" - done.\n\n"); + } if (bBlankSearchFile) return false; @@ -3193,12 +3209,19 @@ bool CometSearchManager::InitializeSingleSpectrumSearch() // Load databases CometFragmentIndex sqSearch; - if (!g_bPlainPeptideIndexRead) + if (g_staticParams.iIndexDb == 1 && !g_bPlainPeptideIndexRead) { sqSearch.ReadPlainPeptideIndex(); sqSearch.CreateFragmentIndex(tp); } +/* FIX: need to add this functionality; how to specify PeptideIndex though? 
+ else if (g_staticParams.iIndexDb == 2 && !g_PeptideIndexRead) + { + sqSearch.CreatePeptideIndex(tp); + } +*/ + singleSearchInitializationComplete = true; return true; @@ -3587,9 +3610,24 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, if (!InitializeSingleSpectrumSearch()) return false; + // tRealTimeStart used to track elapsed search time and to exit if g_staticParams.options.iMaxIndexRunTime is surpased + g_staticParams.tRealTimeStart = std::chrono::high_resolution_clock::now(); + // We need to reset some of the static variables in-between input files CometPreprocess::Reset(); +#ifdef PERF_DEBUG + // print set search parameters + std::map mapParams = GetParamsMap(); + for (std::map::iterator it = mapParams.begin(); it != mapParams.end(); ++it) + { + if (it->first != "[COMET_ENZYME_INFO]") + { + printf("OK parameter name=\"%s\" value=\"%s\"\n", it->first.c_str(), it->second->GetStringValue().c_str()); + } + } +#endif + // IMPORTANT: From this point onwards, because we've loaded some // spectra, we MUST "goto cleanup_results" before exiting the loop, // or we will create a memory leak! 
diff --git a/RealtimeSearch/Search.cs b/RealtimeSearch/Search.cs index 885fab4c..45157e94 100644 --- a/RealtimeSearch/Search.cs +++ b/RealtimeSearch/Search.cs @@ -126,6 +126,7 @@ static void Main(string[] args) { double dTmp = double.Parse(trailerData.Values[i]); double dMassDiff = Math.Abs(dTmp - dPrecursorMZ); + if (dTmp != 0.0 && dMassDiff < 10.0) dPrecursorMZ = dTmp; } @@ -274,6 +275,7 @@ public bool ConfigureInputSettings(CometSearchManagerWrapper SearchMgr, sTmp = dTmp.ToString(); SearchMgr.SetParam("fragment_bin_tol", sTmp, dTmp); + dTmp = 0.0; // fragment bin offst sTmp = dTmp.ToString(); SearchMgr.SetParam("fragment_bin_offset", sTmp, dTmp); From 2ee25f1ce1722747c749e686c26fd009073027e0 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Fri, 27 Dec 2024 15:48:53 -0800 Subject: [PATCH 16/18] address memory leak in GetPrevNextAA that showed up with peptide index RTS --- CometSearch/CometMassSpecUtils.cpp | 34 +++-- CometSearch/CometPostAnalysis.cpp | 2 +- CometSearch/CometSearch.cpp | 5 +- CometSearch/CometSearchManager.cpp | 219 +++++++++++++++-------------- RealtimeSearch/Search.cs | 48 ++++--- 5 files changed, 160 insertions(+), 148 deletions(-) diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index a51eaed2..82931e0c 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ b/CometSearch/CometMassSpecUtils.cpp @@ -414,8 +414,9 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, else pOutput[iWhichResult].cNextAA = szSequence[iEndPos + 1]; - bFound = true; - break; + free(szSequence); + strSeq.clear(); + return; } else if (g_staticParams.options.bClipNtermMet && iStartPos == 1 && szSequence[0] == 'M' && cs.CheckEnzymeEndTermini(szSequence, iEndPos)) { @@ -426,15 +427,13 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, else pOutput[iWhichResult].cNextAA = szSequence[iEndPos + 1]; - bFound = true; - break; + free(szSequence); + strSeq.clear(); + return; } ++iStartPos; } - - if (bFound) - break; } else if (iWhichTerm == 
1) { @@ -450,8 +449,9 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, else pOutput[iWhichResult].cNextAA = '-'; - bFound = true; - break; + free(szSequence); + strSeq.clear(); + return; } else if (g_staticParams.options.bClipNtermMet && szSequence[0] == 'M' @@ -461,7 +461,10 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, pOutput[iWhichResult].cPrevAA = 'M'; pOutput[iWhichResult].cNextAA = szSequence[iEndPos + 1]; bFound = true; - break; + + free(szSequence); + strSeq.clear(); + return; } } else if (iWhichTerm == 2) @@ -479,20 +482,25 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, pOutput[iWhichResult].cNextAA = '-'; - bFound = true; - break; + free(szSequence); + strSeq.clear(); + return; } else if (g_staticParams.options.bClipNtermMet && iStartPos == 1 && szSequence[0] == 'M') { pOutput[iWhichResult].cPrevAA = 'M'; pOutput[iWhichResult].cNextAA = '-'; bFound = true; - break; + + free(szSequence); + strSeq.clear(); + return; } } } free(szSequence); + strSeq.clear(); } if (!bFound) diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index 189b7e21..b05f79a9 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -370,7 +370,7 @@ void CometPostAnalysis::CalculateSP(Results *pOutput, } } - if (pOutput[i].iLenPeptide > 0 && pOutput[i].fXcorr > XCORR_CUTOFF) // take care of possible edge case + if (pOutput[i].iLenPeptide > 0 && pOutput[i].fXcorr > g_staticParams.options.dMinimumXcorr) // take care of possible edge case { int ii; int ctCharge; diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 81a1c38e..5cbf5e1b 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -4562,7 +4562,7 @@ void CometSearch::StorePeptide(int iWhichQuery, siLowestXcorrScoreIndex = siA; } else if (pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr == pQuery->_pResults[siA].fXcorr - && pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr > XCORR_CUTOFF) + && 
pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr > g_staticParams.options.dMinimumXcorr) { // if current lowest score is the same as current siA peptide, // determine if need to point to siA peptide as the one to replace @@ -4717,8 +4717,7 @@ void CometSearch::StorePeptide(int iWhichQuery, if (iVal > 0) { - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] - = g_staticParams.variableModParameters.varModList[iVal-1].dVarModMass; + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal-1].dVarModMass; } else if (iVal < 0) { diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 792e7630..d8bcdbb2 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -2873,7 +2873,7 @@ bool CometSearchManager::DoSearch() for (int iWhichResult = 0; iWhichResult < iNumPrintLines; ++iWhichResult) { - if (g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].iLenPeptide > 0 && g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].fXcorr > XCORR_CUTOFF) + if (g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].iLenPeptide > 0 && g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].fXcorr > g_staticParams.options.dMinimumXcorr) { int iNtermMod = g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].piVarModSites[0]; int iCtermMod = g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].piVarModSites[g_pvQuery.at(iWhichQuery)->_pResults[iWhichResult].iLenPeptide - 1]; @@ -2948,8 +2948,8 @@ bool CometSearchManager::DoSearch() // Deleting each Query object in the vector calls its destructor, which // frees the spectral memory (see definition for Query in CometData.h). 
- for (std::vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - delete *it; + for (auto it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + delete (*it); g_pvQuery.clear(); @@ -3604,7 +3604,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, if (iNumPeaks == 0) return false; - if (dMZ * iPrecursorCharge - (iPrecursorCharge - 1) * PROTON_MASS > g_staticParams.options.dPeptideMassHigh) + if (dMZ * iPrecursorCharge - (iPrecursorCharge - 1.0) * PROTON_MASS > g_staticParams.options.dPeptideMassHigh) return false; // this assumes dPeptideMassHigh is set correctly in the calling program if (!InitializeSingleSpectrumSearch()) @@ -3677,6 +3677,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, } takeSearchResultsN = topN; // return up to the top N results, or iSize + if (takeSearchResultsN > iSize) takeSearchResultsN = iSize; @@ -3695,20 +3696,20 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, goto cleanup_results; Query* pQuery; - pQuery = g_pvQuery.at(0); // return info for top hit only + pQuery = g_pvQuery.at(0); // there's only a single query spectrum for (int idx = 0; idx < takeSearchResultsN; ++idx) { Scores score; score.dCn = 0; - score.xCorr = 0; + score.xCorr = g_staticParams.options.dMinimumXcorr; score.matchedIons = 0; score.totalIons = 0; std::string eachStrReturnPeptide; std::string eachStrReturnProtein; vector eachMatchedFragments; - if (iSize > 0 && pQuery->_pResults[idx].fXcorr > XCORR_CUTOFF && pQuery->_pResults[idx].iLenPeptide > 0) + if (iSize > 0 && pQuery->_pResults[idx].fXcorr > g_staticParams.options.dMinimumXcorr && pQuery->_pResults[idx].iLenPeptide > 0) { Results* pOutput = pQuery->_pResults; @@ -3721,29 +3722,29 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // n-term variable mod if (pOutput[idx].piVarModSites[pOutput[idx].iLenPeptide] != 0) { - std::stringstream ss; - ss << "n[" << std::fixed << 
std::setprecision(4) << pOutput[idx].pdVarModSites[pOutput[idx].iLenPeptide] << "]"; - eachStrReturnPeptide += ss.str(); + std::stringstream ss; + ss << "n[" << std::fixed << std::setprecision(4) << pOutput[idx].pdVarModSites[pOutput[idx].iLenPeptide] << "]"; + eachStrReturnPeptide += ss.str(); } for (int i = 0; i < pOutput[idx].iLenPeptide; ++i) { - eachStrReturnPeptide += pOutput[idx].szPeptide[i]; + eachStrReturnPeptide += pOutput[idx].szPeptide[i]; - if (pOutput[idx].piVarModSites[i] != 0) - { - std::stringstream ss; - ss << "[" << std::fixed << std::setprecision(4) << pOutput[idx].pdVarModSites[i] << "]"; - eachStrReturnPeptide += ss.str(); - } + if (pOutput[idx].piVarModSites[i] != 0) + { + std::stringstream ss; + ss << "[" << std::fixed << std::setprecision(4) << pOutput[idx].pdVarModSites[i] << "]"; + eachStrReturnPeptide += ss.str(); + } } // c-term variable mod if (pOutput[idx].piVarModSites[pOutput[idx].iLenPeptide + 1] != 0) { - std::stringstream ss; - ss << "c[" << std::fixed << std::setprecision(4) << pOutput[idx].pdVarModSites[pOutput[idx].iLenPeptide + 1] << "]"; - eachStrReturnPeptide += ss.str(); + std::stringstream ss; + ss << "c[" << std::fixed << std::setprecision(4) << pOutput[idx].pdVarModSites[pOutput[idx].iLenPeptide + 1] << "]"; + eachStrReturnPeptide += ss.str(); } eachStrReturnPeptide += "." 
+ std::string(1, pOutput[idx].cNextAA); @@ -3754,13 +3755,13 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, szProtein[511] = '\0'; eachStrReturnProtein = szProtein; //protein - score.xCorr = pOutput[idx].fXcorr; // xcorr + score.xCorr = pOutput[idx].fXcorr; // xcorr score.dCn = pOutput[idx].fDeltaCn; // deltaCn score.dSp = pOutput[idx].fScoreSp; // prelim score - score.dExpect = pOutput[idx].dExpect; // E-value - score.mass = pOutput[idx].dPepMass - PROTON_MASS; // calc neutral pep mass - score.matchedIons = pOutput[idx].iMatchedIons; // ions matched - score.totalIons = pOutput[idx].iTotalIons; // ions tot + score.dExpect = pOutput[idx].dExpect; // E-value + score.mass = pOutput[idx].dPepMass - PROTON_MASS; // calc neutral pep mass + score.matchedIons = pOutput[idx].iMatchedIons; // ions matched + score.totalIons = pOutput[idx].iTotalIons; // ions tot int iMinLength = g_staticParams.options.peptideLengthRange.iEnd; for (int x = 0; x < iSize; ++x) @@ -3826,114 +3827,114 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // Generate pdAAforward for pQuery->_pResults[idx].szPeptide. 
for (int i = 0; i < pQuery->_pResults[idx].iLenPeptide - 1; ++i) { - int iPos = pQuery->_pResults[idx].iLenPeptide - i - 1; + int iPos = pQuery->_pResults[idx].iLenPeptide - i - 1; - dBion += g_staticParams.massUtility.pdAAMassFragment[(int)pQuery->_pResults[idx].szPeptide[i]]; - dYion += g_staticParams.massUtility.pdAAMassFragment[(int)pQuery->_pResults[idx].szPeptide[iPos]]; + dBion += g_staticParams.massUtility.pdAAMassFragment[(int)pQuery->_pResults[idx].szPeptide[i]]; + dYion += g_staticParams.massUtility.pdAAMassFragment[(int)pQuery->_pResults[idx].szPeptide[iPos]]; - if (g_staticParams.variableModParameters.bVarModSearch) - { - if (pQuery->_pResults[idx].piVarModSites[i] != 0) - dBion += pQuery->_pResults[idx].pdVarModSites[i]; + if (g_staticParams.variableModParameters.bVarModSearch) + { + if (pQuery->_pResults[idx].piVarModSites[i] != 0) + dBion += pQuery->_pResults[idx].pdVarModSites[i]; - if (pQuery->_pResults[idx].piVarModSites[iPos] != 0) - dYion += pQuery->_pResults[idx].pdVarModSites[iPos]; - } + if (pQuery->_pResults[idx].piVarModSites[iPos] != 0) + dYion += pQuery->_pResults[idx].pdVarModSites[iPos]; + } - map::iterator it; - for (int ctCharge = 1; ctCharge <= pQuery->_spectrumInfoInternal.iMaxFragCharge; ++ctCharge) + map::iterator it; + for (int ctCharge = 1; ctCharge <= pQuery->_spectrumInfoInternal.iMaxFragCharge; ++ctCharge) + { + // calculate every ion series the user specified + for (int ionSeries = 0; ionSeries < NUM_ION_SERIES; ++ionSeries) { - // calculate every ion series the user specified - for (int ionSeries = 0; ionSeries < NUM_ION_SERIES; ++ionSeries) + // skip ion series that are not enabled. + if (!g_staticParams.ionInformation.iIonVal[ionSeries]) { - // skip ion series that are not enabled. 
- if (!g_staticParams.ionInformation.iIonVal[ionSeries]) - { - continue; - } + continue; + } + + bool isNTerm = (ionSeries <= ION_SERIES_C); - bool isNTerm = (ionSeries <= ION_SERIES_C); + // get the fragment mass if it is n- or c-terimnus + double mass = (isNTerm) ? dBion : dYion; + int fragNumber = i + 1; - // get the fragment mass if it is n- or c-terimnus - double mass = (isNTerm) ? dBion : dYion; - int fragNumber = i + 1; + // Add any conversion factor from different ion series (e.g. b -> a, or y -> z) + mass += ionMassesRelative[ionSeries]; - // Add any conversion factor from different ion series (e.g. b -> a, or y -> z) - mass += ionMassesRelative[ionSeries]; + double mz = (mass + (ctCharge - 1) * PROTON_MASS) / ctCharge; + iTmp = BIN(mz); + if (iTmp < g_staticParams.iArraySizeGlobal && pdTmpSpectrum[iTmp] > 0.0) + { + Fragment frag; + frag.intensity = pdTmpSpectrum[iTmp]; + frag.mass = mass; + frag.type = ionSeries; + frag.number = fragNumber; + frag.charge = ctCharge; + frag.neutralLoss = false; + frag.neutralLossMass = 0.0; + eachMatchedFragments.push_back(frag); + } - double mz = (mass + (ctCharge - 1) * PROTON_MASS) / ctCharge; - iTmp = BIN(mz); - if (iTmp < g_staticParams.iArraySizeGlobal && pdTmpSpectrum[iTmp] > 0.0) + if (g_staticParams.variableModParameters.bUseFragmentNeutralLoss) + { + for (int iMod = 0; iMod < VMODS; ++iMod) { + double dNLmass = g_staticParams.variableModParameters.varModList[iMod].dNeutralLoss; + + if (dNLmass == 0.0 || g_staticParams.variableModParameters.varModList[iMod].dVarModMass == 0.0) + { + continue; // continue if this iMod entry has no mod mass or no NL mass specified + } + + if (isNTerm) + { + // if have not already come across n-term mod residue for variable mod iMod, see if position i contains the variable mod + if (!bAddNtermFragmentNeutralLoss[iMod] && pOutput[idx].piVarModSites[i] == iMod + 1) + { + bAddNtermFragmentNeutralLoss[iMod] = true; + } + } + else + { + if (!bAddCtermFragmentNeutralLoss[iMod] && 
pOutput[idx].piVarModSites[iPos] == iMod + 1) + { + bAddCtermFragmentNeutralLoss[iMod] = true; + } + } + + if ((isNTerm && !bAddNtermFragmentNeutralLoss[iMod]) + || (!isNTerm && !bAddCtermFragmentNeutralLoss[iMod])) + { + continue; // no fragment NL yet in peptide so continue + } + + double dNLfragMz = mz - (dNLmass / ctCharge); + iTmp = BIN(dNLfragMz); + if (iTmp < g_staticParams.iArraySizeGlobal && iTmp >= 0 && pdTmpSpectrum[iTmp] > 0.0) + { Fragment frag; frag.intensity = pdTmpSpectrum[iTmp]; - frag.mass = mass; + frag.mass = mass - dNLmass; frag.type = ionSeries; frag.number = fragNumber; frag.charge = ctCharge; - frag.neutralLoss = false; - frag.neutralLossMass = 0.0; + frag.neutralLoss = true; + frag.neutralLossMass = dNLmass; eachMatchedFragments.push_back(frag); - } - - if (g_staticParams.variableModParameters.bUseFragmentNeutralLoss) - { - for (int iMod = 0; iMod < VMODS; ++iMod) - { - double dNLmass = g_staticParams.variableModParameters.varModList[iMod].dNeutralLoss; - - if (dNLmass == 0.0 || g_staticParams.variableModParameters.varModList[iMod].dVarModMass == 0.0) - { - continue; // continue if this iMod entry has no mod mass or no NL mass specified - } - - if (isNTerm) - { - // if have not already come across n-term mod residue for variable mod iMod, see if position i contains the variable mod - if (!bAddNtermFragmentNeutralLoss[iMod] && pOutput[idx].piVarModSites[i] == iMod + 1) - { - bAddNtermFragmentNeutralLoss[iMod] = true; - } - } - else - { - if (!bAddCtermFragmentNeutralLoss[iMod] && pOutput[idx].piVarModSites[iPos] == iMod + 1) - { - bAddCtermFragmentNeutralLoss[iMod] = true; - } - } - - if ((isNTerm && !bAddNtermFragmentNeutralLoss[iMod]) - || (!isNTerm && !bAddCtermFragmentNeutralLoss[iMod])) - { - continue; // no fragment NL yet in peptide so continue - } - - double dNLfragMz = mz - (dNLmass / ctCharge); - iTmp = BIN(dNLfragMz); - if (iTmp < g_staticParams.iArraySizeGlobal && iTmp >= 0 && pdTmpSpectrum[iTmp] > 0.0) - { - Fragment frag; - 
frag.intensity = pdTmpSpectrum[iTmp]; - frag.mass = mass - dNLmass; - frag.type = ionSeries; - frag.number = fragNumber; - frag.charge = ctCharge; - frag.neutralLoss = true; - frag.neutralLossMass = dNLmass; - eachMatchedFragments.push_back(frag); - } - } + } } } } + } } } else { eachStrReturnPeptide = ""; // peptide eachStrReturnProtein = ""; // protein - score.xCorr = -1; // xcorr + score.xCorr = -999 ; // xcorr score.dSp = 0; // prelim score score.dExpect = 999; // E-value score.mass = 0; // calc neutral pep mass @@ -3951,8 +3952,8 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // Deleting each Query object in the vector calls its destructor, which // frees the spectral memory (see definition for Query in CometDataInternal.h). - if (g_pvQuery.size() > 0) - delete g_pvQuery.at(0); + for (auto it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + delete (*it); g_pvQuery.clear(); diff --git a/RealtimeSearch/Search.cs b/RealtimeSearch/Search.cs index 45157e94..ef5c9873 100644 --- a/RealtimeSearch/Search.cs +++ b/RealtimeSearch/Search.cs @@ -144,41 +144,45 @@ static void Main(string[] args) // these next variables store return value from search List vPeptide = new List(); List vProtein = new List(); - List> vMatchingFragments; - List vScores; int topN = 5; // report up to topN hits per query watch.Start(); SearchMgr.DoSingleSpectrumSearchMultiResults(topN, iPrecursorCharge, dPrecursorMZ, pdMass, pdInten, iNumPeaks, - out vPeptide, out vProtein, out vMatchingFragments, out vScores); + out vPeptide, out vProtein, out List> vMatchingFragments, out List vScores); watch.Stop(); int iProteinLengthCutoff = 30; if (vPeptide.Count > 0 && (iScanNumber % 1) == 0) { - for (int x = 0; x < vPeptide.Count; ++x) + if (vPeptide[0].Length > 0) { - string protein = vProtein[x]; - if (protein.Length > iProteinLengthCutoff) - protein = protein.Substring(0, iProteinLengthCutoff); // trim to avoid printing long protein description string - - 
Console.WriteLine("{0}\t{12}\t{1}\t{2}\t{3:0.0000}\t{11:0.0000}\t{10:0.0}\t{4:E4}\t{5:0.0000}\t{6:0.0000}\t{7}\t{8}\t{9}", - iScanNumber, vPeptide[x], protein, vScores[x].xCorr, vScores[x].dExpect, dExpPepMass - 1.00727646688, vScores[x].mass, - vScores[x].MatchedIons, vScores[x].TotalIons, iPass, vScores[x].dSp, vScores[x].dCn, x + 1); -/* - foreach (var myFragment in vMatchingFragments[x]) // print matched fragment ions + for (int x = 0; x < vPeptide.Count; ++x) { - Console.WriteLine("\t{0:0.0000}\t{1:0.0}\t{2}+\t{3}-ion", - myFragment.Mass, - myFragment.Intensity, - myFragment.Charge, - myFragment.Type); - } + if (vPeptide[x].Length > 0) + { + string protein = vProtein[x]; + if (protein.Length > iProteinLengthCutoff) + protein = protein.Substring(0, iProteinLengthCutoff); // trim to avoid printing long protein description string + + Console.WriteLine("{0}\t{12}\t{1}\t{2}\t{3:0.0000}\t{11:0.0000}\t{10:0.0}\t{4:E4}\t{5:0.0000}\t{6:0.0000}\t{7}\t{8}\t{9}", + iScanNumber, vPeptide[x], protein, vScores[x].xCorr, vScores[x].dExpect, dExpPepMass - 1.00727646688, vScores[x].mass, + vScores[x].MatchedIons, vScores[x].TotalIons, iPass, vScores[x].dSp, vScores[x].dCn, x + 1); +/* + foreach (var myFragment in vMatchingFragments[x]) // print matched fragment ions + { + Console.WriteLine("\t{0:0.0000}\t{1:0.0}\t{2}+\t{3}-ion", + myFragment.Mass, + myFragment.Intensity, + myFragment.Charge, + myFragment.Type); + } */ + } + } + Console.WriteLine(""); } - Console.WriteLine(""); } if (vPeptide.Count > 0) @@ -238,8 +242,8 @@ public bool ConfigureInputSettings(CometSearchManagerWrapper SearchMgr, String sTmp; int iTmp; double dTmp; - DoubleRangeWrapper doubleRangeParam = new DoubleRangeWrapper(); - IntRangeWrapper intRangeParam = new IntRangeWrapper(); +// DoubleRangeWrapper doubleRangeParam = new DoubleRangeWrapper(); +// IntRangeWrapper intRangeParam = new IntRangeWrapper(); SearchMgr.SetParam("database_name", sDB, sDB); From 54f1ff1a45201c9e66103407fc4647bf6e33779b Mon Sep 17 
00:00:00 2001 From: Jimmy Eng Date: Tue, 31 Dec 2024 10:49:24 -0800 Subject: [PATCH 17/18] Skip some peptide index parsing on subsequent calls to DoSingleSpectrumSearchMultiResults(). --- CometSearch/CometDataInternal.h | 1 + CometSearch/CometMassSpecUtils.cpp | 9 +- CometSearch/CometSearch.cpp | 263 +++++++++++++++-------------- CometSearch/CometSearchManager.cpp | 7 - 4 files changed, 144 insertions(+), 136 deletions(-) diff --git a/CometSearch/CometDataInternal.h b/CometSearch/CometDataInternal.h index 0ebb3736..f1d9d325 100644 --- a/CometSearch/CometDataInternal.h +++ b/CometSearch/CometDataInternal.h @@ -976,6 +976,7 @@ extern int* PEPTIDE_MOD_SEQ_IDXS; extern int MOD_NUM; extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index file is read (and fragment index generated) + // poor choice of name for the fragment index .idx given peptide index is back extern bool g_bPeptideIndexRead; // set to true if peptide index file is read // Query stores information for peptide scoring and results diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index 82931e0c..952be7cb 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ b/CometSearch/CometMassSpecUtils.cpp @@ -385,11 +385,18 @@ void CometMassSpecUtils::GetPrevNextAA(FILE *fpfasta, } } - CometSearch cs; + if (strSeq.size() < 1) + { + printf("Error - parsed sequence in GetPrevNextAA() is empty. 
File pointer %ld, query %d, result %d.\n", *it, iWhichQuery, iWhichResult); + pOutput[iWhichResult].cPrevAA = pOutput[iWhichResult].cNextAA = '-'; + return; + } char* szSequence = (char*)malloc(strSeq.size() + 1); strcpy(szSequence, strSeq.c_str()); int iLenSequence = (int)strlen(szSequence); + + CometSearch cs; cs._proteinInfo.iTmpProteinSeqLength = iLenSequence; // used in CheckEnzymeTermini if (iWhichTerm == 0) diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 5cbf5e1b..035a72f2 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -357,7 +357,7 @@ bool CometSearch::RunSearch(int iPercentStart, if (iNumBadChars > 20) { logerr(" Too many non-printing characters in database header lines; wrong file type/format?\n"); - fclose(fp); + std::fclose(fp); return false; } } @@ -385,7 +385,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error realloc(szPeffLine[" + to_string(iLenSzLine) + "])\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } szPeffLine = pTmp; @@ -428,7 +428,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error realloc(szMods[" + to_string(iLenAllocMods) + "])\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } szMods = pTmp; @@ -447,7 +447,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error: PEFF entry '" + dbe.strName + "' missing mod closing parenthesis\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } @@ -566,7 +566,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error realloc(szMods[" + to_string(iLenAllocMods) + "])\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + 
std::fclose(fp); return false; } szMods = pTmp; @@ -585,7 +585,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error: PEFF entry '" + dbe.strName + "' missing variant closing parenthesis\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } @@ -679,7 +679,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error realloc(szMods[" + to_string(iLenAllocMods) + "])\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } szMods = pTmp; @@ -698,7 +698,7 @@ bool CometSearch::RunSearch(int iPercentStart, string strErrorMsg = " Error: PEFF entry '" + dbe.strName + "' missing variant closing parenthesis\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg.c_str()); - fclose(fp); + std::fclose(fp); return false; } @@ -856,7 +856,7 @@ bool CometSearch::RunSearch(int iPercentStart, bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); } - fclose(fp); + std::fclose(fp); if (!g_staticParams.options.bOutputSqtStream) { @@ -946,7 +946,7 @@ void CometSearch::ReadOBO(char *szOBO, } } - fclose(fp); + std::fclose(fp); } else { @@ -1588,155 +1588,162 @@ bool CometSearch::SearchPeptideIndex(void) return false; } - // ignore any static masses in params file; only valid ones - // are those in database index - memset(g_staticParams.staticModifications.pdStaticMods, 0, sizeof(g_staticParams.staticModifications.pdStaticMods)); - - bool bFoundStatic = false; - bool bFoundVariable = false; - - // read in static and variable mods - while (fgets(szBuf, sizeof(szBuf), fp)) + if (!g_bPeptideIndexRead) // save some repeated parsing when being called from DLL { - if (!strncmp(szBuf, "MassType:", 9)) - { - sscanf(szBuf, "%d %d", &g_staticParams.massUtility.bMonoMassesParent, &g_staticParams.massUtility.bMonoMassesFragment); - } - else if 
(!strncmp(szBuf, "StaticMod:", 10)) - { - char *tok; - char delims[] = " "; - int x=65; + // ignore any static masses in params file; only valid ones + // are those in database index + memset(g_staticParams.staticModifications.pdStaticMods, 0, sizeof(g_staticParams.staticModifications.pdStaticMods)); - // FIX: hack here for setting static mods; need to reset masses ... fix later - CometMassSpecUtils::AssignMass(g_staticParams.massUtility.pdAAMassFragment, - g_staticParams.massUtility.bMonoMassesFragment, - &g_staticParams.massUtility.dOH2fragment); + bool bFoundStatic = false; + bool bFoundVariable = false; - bFoundStatic = true; - tok=strtok(szBuf+11, delims); - while (tok != NULL) + // read in static and variable mods + while (fgets(szBuf, sizeof(szBuf), fp)) + { + if (!strncmp(szBuf, "MassType:", 9)) { - sscanf(tok, "%lf", &(g_staticParams.staticModifications.pdStaticMods[x])); - g_staticParams.massUtility.pdAAMassFragment[x] += g_staticParams.staticModifications.pdStaticMods[x]; - tok = strtok(NULL, delims); - x++; - if (x==95) // 65-90 stores A-Z then next 4 (ascii 91-94) are n/c term peptide, n/c term protein - break; + sscanf(szBuf, "%d %d", &g_staticParams.massUtility.bMonoMassesParent, &g_staticParams.massUtility.bMonoMassesFragment); } + else if (!strncmp(szBuf, "StaticMod:", 10)) + { + char* tok; + char delims[] = " "; + int x = 65; + + // FIX: hack here for setting static mods; need to reset masses ... 
fix later + CometMassSpecUtils::AssignMass(g_staticParams.massUtility.pdAAMassFragment, + g_staticParams.massUtility.bMonoMassesFragment, + &g_staticParams.massUtility.dOH2fragment); + + bFoundStatic = true; + tok = strtok(szBuf + 11, delims); + while (tok != NULL) + { + sscanf(tok, "%lf", &(g_staticParams.staticModifications.pdStaticMods[x])); + g_staticParams.massUtility.pdAAMassFragment[x] += g_staticParams.staticModifications.pdStaticMods[x]; + tok = strtok(NULL, delims); + x++; + if (x == 95) // 65-90 stores A-Z then next 4 (ascii 91-94) are n/c term peptide, n/c term protein + break; + } - g_staticParams.staticModifications.dAddNterminusPeptide = g_staticParams.staticModifications.pdStaticMods[91]; - g_staticParams.staticModifications.dAddCterminusPeptide = g_staticParams.staticModifications.pdStaticMods[92]; - g_staticParams.staticModifications.dAddNterminusProtein = g_staticParams.staticModifications.pdStaticMods[93]; - g_staticParams.staticModifications.dAddCterminusProtein = g_staticParams.staticModifications.pdStaticMods[94]; + g_staticParams.staticModifications.dAddNterminusPeptide = g_staticParams.staticModifications.pdStaticMods[91]; + g_staticParams.staticModifications.dAddCterminusPeptide = g_staticParams.staticModifications.pdStaticMods[92]; + g_staticParams.staticModifications.dAddNterminusProtein = g_staticParams.staticModifications.pdStaticMods[93]; + g_staticParams.staticModifications.dAddCterminusProtein = g_staticParams.staticModifications.pdStaticMods[94]; - // have to set these here again once static mods are read - g_staticParams.precalcMasses.dNtermProton = g_staticParams.staticModifications.dAddNterminusPeptide - + PROTON_MASS; + // have to set these here again once static mods are read + g_staticParams.precalcMasses.dNtermProton = g_staticParams.staticModifications.dAddNterminusPeptide + + PROTON_MASS; - g_staticParams.precalcMasses.dCtermOH2Proton = g_staticParams.staticModifications.dAddCterminusPeptide - + 
g_staticParams.massUtility.dOH2fragment - + PROTON_MASS; + g_staticParams.precalcMasses.dCtermOH2Proton = g_staticParams.staticModifications.dAddCterminusPeptide + + g_staticParams.massUtility.dOH2fragment + + PROTON_MASS; - g_staticParams.precalcMasses.dOH2ProtonCtermNterm = g_staticParams.massUtility.dOH2parent - + PROTON_MASS - + g_staticParams.staticModifications.dAddCterminusPeptide - + g_staticParams.staticModifications.dAddNterminusPeptide; - } - else if (!strncmp(szBuf, "Enzyme:", 7)) - { - sscanf(szBuf, "Enzyme: %s [%d %s %s]", g_staticParams.enzymeInformation.szSearchEnzymeName, - &(g_staticParams.enzymeInformation.iSearchEnzymeOffSet), - g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, - g_staticParams.enzymeInformation.szSearchEnzymeNoBreakAA); - } - else if (!strncmp(szBuf, "Enzyme2:", 8)) - { - sscanf(szBuf, "Enzyme2: %s [%d %s %s]", g_staticParams.enzymeInformation.szSearchEnzyme2Name, - &(g_staticParams.enzymeInformation.iSearchEnzyme2OffSet), - g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, - g_staticParams.enzymeInformation.szSearchEnzyme2NoBreakAA); - } - else if (!strncmp(szBuf, "VariableMod:", 12)) - { - char *tok; - char delims[] = " "; - int x=0; + g_staticParams.precalcMasses.dOH2ProtonCtermNterm = g_staticParams.massUtility.dOH2parent + + PROTON_MASS + + g_staticParams.staticModifications.dAddCterminusPeptide + + g_staticParams.staticModifications.dAddNterminusPeptide; + } + else if (!strncmp(szBuf, "Enzyme:", 7)) + { + sscanf(szBuf, "Enzyme: %s [%d %s %s]", g_staticParams.enzymeInformation.szSearchEnzymeName, + &(g_staticParams.enzymeInformation.iSearchEnzymeOffSet), + g_staticParams.enzymeInformation.szSearchEnzymeBreakAA, + g_staticParams.enzymeInformation.szSearchEnzymeNoBreakAA); + } + else if (!strncmp(szBuf, "Enzyme2:", 8)) + { + sscanf(szBuf, "Enzyme2: %s [%d %s %s]", g_staticParams.enzymeInformation.szSearchEnzyme2Name, + &(g_staticParams.enzymeInformation.iSearchEnzyme2OffSet), + 
g_staticParams.enzymeInformation.szSearchEnzyme2BreakAA, + g_staticParams.enzymeInformation.szSearchEnzyme2NoBreakAA); + } + else if (!strncmp(szBuf, "VariableMod:", 12)) + { + char* tok; + char delims[] = " "; + int x = 0; - bFoundVariable = true; + bFoundVariable = true; - tok=strtok(szBuf+13, delims); - while (tok != NULL) - { - tok = strtok(NULL, delims); // skip list of var mod residues + tok = strtok(szBuf + 13, delims); + while (tok != NULL) + { + tok = strtok(NULL, delims); // skip list of var mod residues - // for index search, storing variable mods 0-9 in pdStaticMods array 0-9 - sscanf(tok, "%lf:%lf", &(g_staticParams.variableModParameters.varModList[x].dVarModMass), + // for index search, storing variable mods 0-9 in pdStaticMods array 0-9 + sscanf(tok, "%lf:%lf", &(g_staticParams.variableModParameters.varModList[x].dVarModMass), &(g_staticParams.variableModParameters.varModList[x].dNeutralLoss)); - if (g_staticParams.variableModParameters.varModList[x].dNeutralLoss != 0.0) - g_staticParams.variableModParameters.bUseFragmentNeutralLoss = true; + if (g_staticParams.variableModParameters.varModList[x].dNeutralLoss != 0.0) + g_staticParams.variableModParameters.bUseFragmentNeutralLoss = true; - tok = strtok(NULL, delims); + tok = strtok(NULL, delims); - x++; - if (x == VMODS) - break; + x++; + if (x == VMODS) + break; + } + break; } - break; } - } - if (!(bFoundStatic && bFoundVariable)) - { - char szErr[256]; - sprintf(szErr, " Error with index database format. Mods not parsed (%d %d).", bFoundStatic, bFoundVariable); - logerr(szErr); - fclose(fp); - return false; - } + if (!(bFoundStatic && bFoundVariable)) + { + char szErr[256]; + sprintf(szErr, " Error with index database format. 
Mods not parsed (%d %d).", bFoundStatic, bFoundVariable); + logerr(szErr); + std::fclose(fp); + return false; + } - // indexed searches will always set this to true - g_staticParams.variableModParameters.bVarModSearch = true; + // indexed searches will always set this to true + g_staticParams.variableModParameters.bVarModSearch = true; + } // read fp of index comet_fileoffset_t clTmp; comet_fileoffset_t clProteinsFilePos; - comet_fseek(fp, -clSizeCometFileOffset*2, SEEK_END); + comet_fseek(fp, -clSizeCometFileOffset * 2, SEEK_END); tTmp = fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); tTmp = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); - // now read in: vector> g_pvProteinsList - comet_fseek(fp, clProteinsFilePos, SEEK_SET); - size_t tSize; - tTmp = fread(&tSize, clSizeCometFileOffset, 1, fp); - vector vTmp; - - g_pvProteinsList.clear(); - g_pvProteinsList.reserve(tSize); - for (size_t it = 0; it < tSize; ++it) + if (!g_bPeptideIndexRead) { - size_t tNumProteinOffsets; - tTmp = fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); + // now read in: vector> g_pvProteinsList + comet_fseek(fp, clProteinsFilePos, SEEK_SET); + size_t tSize; + tTmp = fread(&tSize, clSizeCometFileOffset, 1, fp); + vector vTmp; - vTmp.clear(); - for (size_t it2 = 0; it2 < tNumProteinOffsets; ++it2) + g_pvProteinsList.clear(); + g_pvProteinsList.reserve(tSize); + for (size_t it = 0; it < tSize; ++it) { - tTmp = fread(&clTmp, clSizeCometFileOffset, 1, fp); - vTmp.push_back(clTmp); + size_t tNumProteinOffsets; + tTmp = fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); + + vTmp.clear(); + for (size_t it2 = 0; it2 < tNumProteinOffsets; ++it2) + { + tTmp = fread(&clTmp, clSizeCometFileOffset, 1, fp); + vTmp.push_back(clTmp); + } + g_pvProteinsList.push_back(vTmp); } - g_pvProteinsList.push_back(vTmp); + g_bPeptideIndexRead = true; } + // read index + int iMinMass = 0; + int iMaxMass = 0; + uint64_t tNumPeptides = 0; + // seek to index comet_fseek(fp, lEndOfStruct, 
SEEK_SET); - // read index - int iMinMass=0; - int iMaxMass=0; - uint64_t tNumPeptides=0; - tTmp = fread(&iMinMass, sizeof(int), 1, fp); tTmp = fread(&iMaxMass, sizeof(int), 1, fp); tTmp = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); @@ -1747,13 +1754,13 @@ bool CometSearch::SearchPeptideIndex(void) char szErr[256]; sprintf(szErr, " Error reading .idx database: min mass %d, max mass %d, num peptides %zu\n", iMinMass, iMaxMass, tNumPeptides); logerr(szErr); - fclose(fp); + std::fclose(fp); return false; } int iMaxPeptideMass10 = iMaxMass * 10; - comet_fileoffset_t *lReadIndex = new comet_fileoffset_t[iMaxPeptideMass10]; - for (int i=0; i< iMaxPeptideMass10; i++) + comet_fileoffset_t* lReadIndex = new comet_fileoffset_t[iMaxPeptideMass10]; + for (int i = 0; i < iMaxPeptideMass10; ++i) lReadIndex[i] = -1; tTmp = fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); @@ -1764,7 +1771,7 @@ bool CometSearch::SearchPeptideIndex(void) if (iStart > iMaxMass) // smallest input mass is greater than what's stored in index { delete[] lReadIndex; - fclose(fp); + std::fclose(fp); return true; } @@ -1790,7 +1797,7 @@ bool CometSearch::SearchPeptideIndex(void) if (lReadIndex[iStart10] == -1) // no match found within tolerance { delete[] lReadIndex; - fclose(fp); + std::fclose(fp); return true; } @@ -1897,7 +1904,7 @@ bool CometSearch::SearchPeptideIndex(void) } delete [] lReadIndex; - fclose(fp); + std::fclose(fp); return true; } diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index d8bcdbb2..ebf98db8 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -3215,13 +3215,6 @@ bool CometSearchManager::InitializeSingleSpectrumSearch() sqSearch.CreateFragmentIndex(tp); } -/* FIX: need to add this functionality; how to specify PeptideIndex though? 
- else if (g_staticParams.iIndexDb == 2 && !g_PeptideIndexRead) - { - sqSearch.CreatePeptideIndex(tp); - } -*/ - singleSearchInitializationComplete = true; return true; From 074d38e4ec40843f0ef80455bb8dbd47db9c21eb Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Tue, 31 Dec 2024 11:00:02 -0800 Subject: [PATCH 18/18] Update Makefile remove extra '-' in "--std=c++14" option --- CometSearch/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 408b458c..4af7032a 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,9 +14,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -static --std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 else - override CXXFLAGS += -O3 -static --std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 + override CXXFLAGS += -O3 -static -std=c++14 -fpermissive -Wall -Wextra -Wno-write-strings -DGITHUBSHA='"$(GITHUB_SHA)"' -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 endif