add DataSetFDR

not tested, don't use it yet
cctsou · Mar 14, 2016 · 3d5472f · 3d5472f
1 parent 62eeb34
commit 3d5472f
Show file tree

Hide file tree

Showing 8 changed files with 144 additions and 62 deletions.
diff --git a/DIA-Umpire/src/FDREstimator/FDR_DataSetLevel.java b/DIA-Umpire/src/FDREstimator/FDR_DataSetLevel.java
@@ -0,0 +1,57 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package FDREstimator;
+
+import MSUmpire.BaseDataStructure.DBSearchParam;
+import MSUmpire.DIA.DIAPack;
+import MSUmpire.PSMDataStructure.LCMSID;
+import MSUmpire.PSMDataStructure.PepIonID;
+import java.io.IOException;
+import java.util.ArrayList;
+import javax.xml.parsers.ParserConfigurationException;
+import org.xml.sax.SAXException;
+import org.xmlpull.v1.XmlPullParserException;
+
+/**
+ *
+ * @author Chih-Chiang Tsou <[email protected]>
+ */
+public class FDR_DataSetLevel {
+    /** Combined peptide-ion list for the whole data set; remains null until GeneratePepIonList has run. */
+    public LCMSID combineID = null;
+
+    /** Parses each file's pepXML, pools all peptide ions into combineID, then runs the FDR filtering calls on it. */
+    public void GeneratePepIonList(ArrayList<DIAPack> DIAFileList, DBSearchParam param, String combineIDPath) throws IOException, ParserConfigurationException, SAXException, XmlPullParserException, ClassNotFoundException, InterruptedException {
+        for (DIAPack run : DIAFileList) {
+            run.ParsePepXML(param, null);
+        }
+        // Pool the ions of every file: a key is cloned on first sight, later sightings can only raise MaxProbability.
+        combineID = new LCMSID(combineIDPath, param.DecoyPrefix, param.FastaPath);
+        for (DIAPack run : DIAFileList) {
+            for (PepIonID ion : run.IDsummary.GetPepIonList().values()) {
+                if (!combineID.GetPepIonList().containsKey(ion.GetKey())) {
+                    PepIonID copy = ion.ClonePepIonID();
+                    if (ion.IsDecoy(param.DecoyPrefix)) {
+                        copy.IsDecoy = 1;
+                    } else {
+                        copy.IsDecoy = 0;
+                    }
+                    combineID.AddPeptideID(copy);
+                }
+                PepIonID pooled = combineID.GetPepIonList().get(ion.GetKey());
+                if (ion.MaxProbability > pooled.MaxProbability) {
+                    pooled.MaxProbability = ion.MaxProbability;
+                }
+            }
+        }
+        // Configure the decoy tag and target FDR (param.PepFDR) before the threshold/removal calls below.
+        combineID.DecoyTag = param.DecoyPrefix;
+        combineID.FDR = param.PepFDR;
+        combineID.FindPepProbThresholdByFDR();
+        combineID.RemoveDecoyPep();
+        combineID.RemoveLowProbPep();
+    }
+}
diff --git a/DIA-Umpire/src/MSUmpire/DIA/DIAPack.java b/DIA-Umpire/src/MSUmpire/DIA/DIAPack.java
@@ -638,17 +638,25 @@ public void SetPepXMLPath(){
         iProphPepXMLs.add(PepXMLPath3);
     }
 
-    public void ParsePepXML(DBSearchParam searchPara) throws ParserConfigurationException, SAXException, IOException, XmlPullParserException, ClassNotFoundException, InterruptedException {
+    public void ParsePepXML(DBSearchParam searchPara, LCMSID refID) throws ParserConfigurationException, SAXException, IOException, XmlPullParserException, ClassNotFoundException, InterruptedException {
 
         SetPepXMLPath();
         IDsummary = new LCMSID(FilenameUtils.getFullPath(Filename) + FilenameUtils.getBaseName(Filename),searchPara.DecoyPrefix,searchPara.FastaPath);        
         for (String pepxml : iProphPepXMLs) {
             LCMSID pepxmlid = new LCMSID(FilenameUtils.getFullPath(Filename) + FilenameUtils.getBaseName(Filename),searchPara.DecoyPrefix,searchPara.FastaPath);
             PepXMLParser pepxmlparser = new PepXMLParser(pepxmlid, pepxml, 0f);
-            pepxmlid.FilterByPepDecoyFDR(searchPara.DecoyPrefix, searchPara.PepFDR);
+            if (refID == null) {
+                pepxmlid.FilterByPepDecoyFDR(searchPara.DecoyPrefix, searchPara.PepFDR);
+            }
             Logger.getRootLogger().info("No. of peptide ions:" + pepxmlid.GetPepIonList().size() + "; Peptide level threshold: " + pepxmlid.PepProbThreshold);
             for (PepIonID pepID : pepxmlid.GetPepIonList().values()) {
-                IDsummary.AddPeptideID(pepID);
+                if (refID != null) {
+                    if(refID.GetPepIonList().containsKey(pepID.GetKey())){
+                        IDsummary.AddPeptideID(pepID);
+                    }                    
+                } else {
+                    IDsummary.AddPeptideID(pepID);
+                }
             }
         }
         IDsummary.ReMapProPep();

diff --git a/DIA-Umpire/src/MSUmpire/MathPackage/ChiSquareGOF.java b/DIA-Umpire/src/MSUmpire/MathPackage/ChiSquareGOF.java
@@ -34,14 +34,14 @@ public class ChiSquareGOF {
     public static ReadWriteLock lock = new ReentrantReadWriteLock();    
 
     private ChiSquareGOF(int maxpeak) {        
-        chimodels = new ChiSquared[maxpeak];
-        for (int i = 2; i <= maxpeak; i++) {
-            chimodels[i - 1] = new ChiSquared(i - 1);
+        chimodels = new ChiSquared[Math.max(0, maxpeak - 1)]; // slot i-1 holds ChiSquared(i); never request a negative size
+        for (int i = 1; i < maxpeak; i++) { // stop at maxpeak-1: "<= maxpeak" would write one slot past the end of the array
+            chimodels[i - 1] = new ChiSquared(i);
         }
     }
 
     public static ChiSquareGOF GetInstance(int maxpeak) {
-        if (models == null || maxpeak > chimodels.length) {
+        if (models == null || (maxpeak>1 && maxpeak >= chimodels.length)) {
             lock.writeLock().lock();
             try {
                 if (models == null) {
@@ -57,17 +57,17 @@ public static ChiSquareGOF GetInstance(int maxpeak) {
     public float GetGoodNessOfFitProb(float[] expected, float[] observed) {
         float gof = 0f;
         int nopeaks = 0;
-        for (int i = 0; i < expected.length; i++) {
+        for (int i = 0; i < Math.min(observed.length,expected.length); i++) { // walk only the shorter array so neither index runs past its end
             if (observed[i] > 0) {
                 float error = expected[i] - observed[i];
                 gof += (error * error) / (expected[i] * expected[i]);
                 nopeaks++;
             }
         }
-        if (Float.isNaN(gof)) {
+        if (Float.isNaN(gof) || nopeaks<2){ // NaN statistic or fewer than two counted peaks: return 0 instead of indexing chimodels
             return 0f;
         }
-        float prob = 1 - (float) chimodels[nopeaks].cdf(gof);
+        float prob = 1 - (float) chimodels[nopeaks-2].cdf(gof); // the constructor stores ChiSquared(i) at index i-1, so this selects ChiSquared(nopeaks-1)
         return prob;
     }
 }
diff --git a/DIA-Umpire/src/MSUmpire/PeakDataStructure/PeakCluster.java b/DIA-Umpire/src/MSUmpire/PeakDataStructure/PeakCluster.java
@@ -217,7 +217,7 @@ public void CalcPeakArea_V2() {
         startRT = MonoIsotopePeak.StartRT();
         endRT = MonoIsotopePeak.EndRT();
 
-        if (IsoPeaksCurves[1]!=null) {
+        if (IsoPeaksCurves.length>1 && IsoPeaksCurves[1]!=null) {
             startRT = Math.min(MonoIsotopePeak.StartRT(), IsoPeaksCurves[1].StartRT());
             endRT = Math.max(MonoIsotopePeak.EndRT(), IsoPeaksCurves[1].EndRT());
         }

diff --git a/DIA-Umpire/src/MSUmpire/PeptidePeakClusterDetection/PDHandlerBase.java b/DIA-Umpire/src/MSUmpire/PeptidePeakClusterDetection/PDHandlerBase.java
@@ -370,15 +370,15 @@ protected void ReadPepIsoMS1PatternMap() throws FileNotFoundException, IOExcepti
 
         InputStream is = this.getClass().getClassLoader().getResourceAsStream("resource/IsotopicPatternRange.csv");
         BufferedReader reader = new BufferedReader(new InputStreamReader(is));
-        IsotopePatternMap = new TreeMap[LCMSPeakBase.MaxNoPeakCluster - 1];
+        IsotopePatternMap = new TreeMap[Math.max(2,LCMSPeakBase.MaxNoPeakCluster - 1)];
         for (int i = 0; i < IsotopePatternMap.length; i++) {
             IsotopePatternMap[i] = new TreeMap<>();
         }
         String line = "";
         while ((line = reader.readLine()) != null) {
             float MW = Float.parseFloat(line.split(",")[0]);
 
-            for (int i = 0; i < LCMSPeakBase.MaxNoPeakCluster - 1; i++) {
+            for (int i = 0; i < IsotopePatternMap.length; i++) {
                 float Mean = Float.parseFloat(line.split(",")[1 + (i * 2)]);
                 float SD = Float.parseFloat(line.split(",")[2 + (i * 2)]);
 

diff --git a/DIA_Umpire_LCMSIDGen/src/dia_umpire_quant/DIA_Umpire_LCMSIDGen.java b/DIA_Umpire_LCMSIDGen/src/dia_umpire_quant/DIA_Umpire_LCMSIDGen.java
@@ -180,7 +180,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException
                 }
                 Logger.getRootLogger().info("Loading identification results " + mzXMLFile + "....");
 
-                DiaFile.ParsePepXML(tandemPara);
+                DiaFile.ParsePepXML(tandemPara,null);
                 DiaFile.BuildStructure();
                 if (!DiaFile.MS1FeatureMap.ReadPeakCluster()) {
                     Logger.getRootLogger().info("Loading peak and structure failed, job is incomplete");

diff --git a/DIA_Umpire_Quant/src/dia_umpire_quant/DIA_Umpire_Quant.java b/DIA_Umpire_Quant/src/dia_umpire_quant/DIA_Umpire_Quant.java
@@ -24,6 +24,7 @@
  * To change this template file, choose Tools | Templates
  * and open the template in the editor.
  */
+import FDREstimator.FDR_DataSetLevel;
 import MSUmpire.BaseDataStructure.UmpireInfo;
 import MSUmpire.DIA.DIAPack;
 import MSUmpire.DIA.RTAlignedPepIonMapping;
@@ -94,7 +95,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException
         String ExternalLibPath = "";
         String ExternalLibDecoyTag = "DECOY";
         boolean DefaultProtFiltering=true;
-
+        boolean DataSetLevelPepFDR=false;
         float ProbThreshold = 0.99f;
         float ExtProbThreshold =0.99f;
         float Freq = 0f;
@@ -202,6 +203,10 @@ public static void main(String[] args) throws FileNotFoundException, IOException
                         tandemPara.PepFDR = Float.parseFloat(value);
                         break;
                     }
+                    case "DataSetLevelPepFDR": {
+                        DataSetLevelPepFDR = Boolean.parseBoolean(value);
+                        break;
+                    }
                     case "InternalLibID": {
                         InternalLibID = value;
                         break;
@@ -332,6 +337,8 @@ public static void main(String[] args) throws FileNotFoundException, IOException
         if (!new File(Combined_Prot).exists()) {
             Logger.getRootLogger().info("ProtXML file: " +Combined_Prot + " cannot be found, the export protein summary table will be empty.");
         }
+
+
         LCMSID protID = null;
 
         //Parse prot.xml and generate protein master list given an FDR 
@@ -388,10 +395,65 @@ public static void main(String[] args) throws FileNotFoundException, IOException
             for (File fileEntry : AssignFiles.values()) {
                 Logger.getRootLogger().info(fileEntry.getAbsolutePath());
             }
+
+
+        LCMSID combinePepID = null;
+        if (DataSetLevelPepFDR) {
+            combinePepID = LCMSID.ReadLCMSIDSerialization(WorkFolder + "combinePepID.SerFS");
+            if (combinePepID == null) {
+                FDR_DataSetLevel fdr = new FDR_DataSetLevel();
+                fdr.GeneratePepIonList(FileList, tandemPara, WorkFolder + "combinePepID.SerFS");
+                combinePepID = fdr.combineID;
+                combinePepID.WriteLCMSIDSerialization(WorkFolder + "combinePepID.SerFS");
+            }
+        }
 
             //process each DIA file for quantification based on untargeted identifications
             for (File fileEntry : AssignFiles.values()) {
-                ProcessDIA(fileEntry, NoCPUs, tandemPara, FileList, IDSummaryFragments);
+                String mzXMLFile = fileEntry.getAbsolutePath();
+                if (mzXMLFile.toLowerCase().endsWith(".mzxml") | mzXMLFile.toLowerCase().endsWith(".mzml")) {
+                    long time = System.currentTimeMillis();
+
+                    DIAPack DiaFile = new DIAPack(mzXMLFile, NoCPUs);
+                    FileList.add(DiaFile);
+                    HashMap<String, FragmentPeak> FragMap = new HashMap<>();
+                    IDSummaryFragments.put(FilenameUtils.getBaseName(mzXMLFile), FragMap);
+                    if (!new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ1Name() + ".mzXML").exists()
+                            | !new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ2Name() + ".mzXML").exists()
+                            | !new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ3Name() + ".mzXML").exists()) {
+                        continue; // skip only this file, as ProcessDIA's return did; a return here would exit main() and drop all remaining files
+                    }
+                    Logger.getRootLogger().info("=================================================================================================");
+                    Logger.getRootLogger().info("Processing " + mzXMLFile);
+                    if (!DiaFile.LoadDIASetting()) {
+                        Logger.getRootLogger().info("Loading DIA setting failed, job is incomplete");
+                        System.exit(1);
+                    }
+                    if (!DiaFile.LoadParams()) {
+                        Logger.getRootLogger().info("Loading parameters failed, job is incomplete");
+                        System.exit(1);
+                    }
+                    Logger.getRootLogger().info("Loading identification results " + mzXMLFile + "....");
+
+                    //If the LCMSID serialization is found
+                    if (!DiaFile.ReadSerializedLCMSID()) {
+                        DiaFile.ParsePepXML(tandemPara, combinePepID);
+                        DiaFile.BuildStructure();
+                        if (!DiaFile.MS1FeatureMap.ReadPeakCluster()) {
+                            Logger.getRootLogger().info("Loading peak and structure failed, job is incomplete");
+                            System.exit(1);
+                        }
+                        DiaFile.MS1FeatureMap.ClearMonoisotopicPeakOfCluster();
+                        //Generate mapping between index of precursor feature and pseudo MS/MS scan index 
+                        DiaFile.GenerateClusterScanNomapping();
+                        //Doing quantification
+                        DiaFile.AssignQuant();
+                        DiaFile.ClearStructure();
+                    }
+                    DiaFile.IDsummary.ReduceMemoryUsage();
+                    time = System.currentTimeMillis() - time;
+                    Logger.getRootLogger().info(mzXMLFile + " processed time:" + String.format("%d hour, %d min, %d sec", TimeUnit.MILLISECONDS.toHours(time), TimeUnit.MILLISECONDS.toMinutes(time) - TimeUnit.HOURS.toMinutes(TimeUnit.MILLISECONDS.toHours(time)), TimeUnit.MILLISECONDS.toSeconds(time) - TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(time))));
+                }
             }
 
             //<editor-fold defaultstate="collapsed" desc="Targete re-extraction using internal library">            
@@ -622,51 +684,5 @@ private static void SaintOutput(LCMSID protID, LCMSID IDsummary, FragmentSelecti
         }
     }
 
-    private static void ProcessDIA(final File fileEntry, int NoCPUs, TandemParam tandemPara, ArrayList<DIAPack> FileList, HashMap<String, HashMap<String, FragmentPeak>> IDSummaryFragments) throws IOException, FileNotFoundException, DataFormatException, InterruptedException, ExecutionException, ParserConfigurationException, SAXException, Exception {
-        String mzXMLFile = fileEntry.getAbsolutePath();
-        if (mzXMLFile.toLowerCase().endsWith(".mzxml") | mzXMLFile.toLowerCase().endsWith(".mzml")) {
-            long time = System.currentTimeMillis();
-
-            DIAPack DiaFile = new DIAPack(mzXMLFile, NoCPUs);
-            FileList.add(DiaFile);
-            HashMap<String, FragmentPeak> FragMap = new HashMap<>();
-            IDSummaryFragments.put(FilenameUtils.getBaseName(mzXMLFile), FragMap);
-            if (!new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ1Name() + ".mzXML").exists()
-                    | !new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ2Name() + ".mzXML").exists()
-                    | !new File(FilenameUtils.getFullPath(DiaFile.Filename) + DiaFile.GetQ3Name() + ".mzXML").exists()) {
-                return;
-            }
-            Logger.getRootLogger().info("=================================================================================================");
-            Logger.getRootLogger().info("Processing " + mzXMLFile);
-            if (!DiaFile.LoadDIASetting()) {
-                Logger.getRootLogger().info("Loading DIA setting failed, job is incomplete");
-                System.exit(1);
-            }
-            if (!DiaFile.LoadParams()) {
-                Logger.getRootLogger().info("Loading parameters failed, job is incomplete");
-                System.exit(1);
-            }
-            Logger.getRootLogger().info("Loading identification results " + mzXMLFile + "....");
-
-            //If the serialization file for ID file existed
-            if (!DiaFile.ReadSerializedLCMSID()) {
-                DiaFile.ParsePepXML(tandemPara);
-                DiaFile.BuildStructure();
-                if (!DiaFile.MS1FeatureMap.ReadPeakCluster()) {
-                    Logger.getRootLogger().info("Loading peak and structure failed, job is incomplete");
-                    System.exit(1);
-                }                
-                DiaFile.MS1FeatureMap.ClearMonoisotopicPeakOfCluster();
-                //Generate mapping between index of precursor feature and pseudo MS/MS scan index 
-                DiaFile.GenerateClusterScanNomapping();
-                //Doing quantification
-                DiaFile.AssignQuant();
-                DiaFile.ClearStructure();
-            }            
-            DiaFile.IDsummary.ReduceMemoryUsage();   
-            time = System.currentTimeMillis() - time;
-            Logger.getRootLogger().info(mzXMLFile + " processed time:" + String.format("%d hour, %d min, %d sec", TimeUnit.MILLISECONDS.toHours(time), TimeUnit.MILLISECONDS.toMinutes(time) - TimeUnit.HOURS.toMinutes(TimeUnit.MILLISECONDS.toHours(time)), TimeUnit.MILLISECONDS.toSeconds(time) - TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(time))));
-        }
-    }
 
 }
diff --git a/DIA_Umpire_Quant/src/dia_umpire_quant/diaumpire_quant.params b/DIA_Umpire_Quant/src/dia_umpire_quant/diaumpire_quant.params
@@ -29,6 +29,7 @@ DecoyPrefix=
 #FDR threshold
 PeptideFDR = 0.01
 ProteinFDR = 0.01
+DataSetLevelPepFDR = false
 
 #Use default protein FDR filtering.
 # Set to true if you wish to filter protein using DIA-Umpire's filtering method based on maximum peptide ion probability values.