diff --git a/.DS_Store b/.DS_Store index 7f42ee8..633f79c 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Microbiota_analysis_R/.DS_Store b/Microbiota_analysis_R/.DS_Store index cdb17a0..b39a8f9 100644 Binary files a/Microbiota_analysis_R/.DS_Store and b/Microbiota_analysis_R/.DS_Store differ diff --git a/Microbiota_analysis_R/Age_clean_simper.csv b/Microbiota_analysis_R/Age_clean_simper.csv new file mode 100644 index 0000000..e3a9292 --- /dev/null +++ b/Microbiota_analysis_R/Age_clean_simper.csv @@ -0,0 +1,47 @@ +"","Comparison","SIMPER","OTU" +"1","1yr_2w",0.0983760952002948,"Otu00002" +"2","1yr_2w",0.0643429812209152,"Otu00001" +"3","1yr_2w",0.059814420014553,"Otu00003" +"4","1yr_2w",0.043254443848701,"Otu00007" +"5","1yr_2w",0.032500932232854,"Otu00011" +"6","1yr_2w",0.028861881829075,"Otu00006" +"7","1yr_2w",0.024270230681437,"Otu00009" +"8","1yr_2w",0.014654607933432,"Otu00014" +"9","1yr_2w",0.013241473967396,"Otu00022" +"10","1yr_2w",0.013143755400476,"Otu00018" +"11","1yr_2w",0.012431390357538,"Otu00012" +"12","1yr_2w",0.012249980054458,"Otu00016" +"13","1yr_2w",0.011256623251136,"Otu00004" +"14","1yr_2w",0.010129194115769,"Otu00021" +"5003","1yr_8w",0.0376560323802521,"Otu00001" +"5004","1yr_8w",0.0356947491021932,"Otu00005" +"5005","1yr_8w",0.0267585191971277,"Otu00006" +"5006","1yr_8w",0.022153382610294,"Otu00004" +"5007","1yr_8w",0.018614937190912,"Otu00010" +"5008","1yr_8w",0.016007395628362,"Otu00017" +"5009","1yr_8w",0.015165896969144,"Otu00008" +"5010","1yr_8w",0.015137420479094,"Otu00009" +"5011","1yr_8w",0.013887124239801,"Otu00015" +"5012","1yr_8w",0.01348689566677,"Otu00018" +"5013","1yr_8w",0.012573204203004,"Otu00016" +"5014","1yr_8w",0.012514113254386,"Otu00014" +"5015","1yr_8w",0.011375010175498,"Otu00029" +"5016","1yr_8w",0.010601901246815,"Otu00019" 
+"5017","1yr_8w",0.010400132475867,"Otu00021" +"10005","2w_8w",0.110139034006507,"Otu00002" +"10006","2w_8w",0.070274296564485,"Otu00001" +"10007","2w_8w",0.066265261269552,"Otu00003" +"10008","2w_8w",0.048569279133863,"Otu00007" +"10009","2w_8w",0.039937569065732,"Otu00009" +"10010","2w_8w",0.039334409088334,"Otu00005" +"10011","2w_8w",0.0355700872344279,"Otu00011" +"10012","2w_8w",0.029688138874924,"Otu00004" +"10013","2w_8w",0.024416435440736,"Otu00010" +"10014","2w_8w",0.017672739089483,"Otu00017" +"10015","2w_8w",0.016919957006409,"Otu00008" +"10016","2w_8w",0.01670700772156,"Otu00012" +"10017","2w_8w",0.015305507942289,"Otu00015" +"10018","2w_8w",0.0146779937502001,"Otu00022" +"10019","2w_8w",0.0125368272355489,"Otu00029" +"10020","2w_8w",0.012417999784185,"Otu00013" +"10021","2w_8w",0.011990487616809,"Otu00019" diff --git a/Microbiota_analysis_R/Age_krusk_simper.csv b/Microbiota_analysis_R/Age_krusk_simper.csv new file mode 100644 index 0000000..866b74b --- /dev/null +++ b/Microbiota_analysis_R/Age_krusk_simper.csv @@ -0,0 +1,47 @@ +"","Comparison","SIMPER","OTU","krusk_p.val","fdr_krusk_p.val","Taxonomy","Left mean abund","Left stdev","Right mean abund","Right stdev" +"1","1yr_2w",0.0983760952002948,"Otu00002",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;","7.11845102505695e-06","2.01340199654484e-05","0.196185324113124","0.237964225950146" +"2","1yr_2w",0.0643429812209152,"Otu00001",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;","7.10913950975374e-06","2.01076830229923e-05","0.128370196584475","0.163518290828978" 
+"3","1yr_2w",0.059814420014553,"Otu00003",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;","0","0","0.119333402823358","0.180003455829014" +"4","1yr_2w",0.043254443848701,"Otu00007",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;g__Bifidobacterium_unclassified;","0","0","0.0862865503350716","0.160256975806244" +"5","1yr_2w",0.032500932232854,"Otu00011",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Butyricicoccus;s__pullicaecorum;","7.10913950975374e-06","2.01076830229923e-05","0.0648314243899582","0.07929234445691" +"6","1yr_2w",0.028861881829075,"Otu00006",0.00044634624164943,0.0013833590660172,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;f__Ruminococcaceae_unclassified;f__Ruminococcaceae_unclassified;","0.0576048927649059","0.00987764114455286","7.14122486288848e-06","2.01984341061057e-05" +"7","1yr_2w",0.024270230681437,"Otu00009",0.0272785116310676,0.0358517581436888,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__perfringens;","0","0","0.0484260502484597","0.136300018759695" +"8","1yr_2w",0.014654607933432,"Otu00014",0.000559240905749555,0.00151324009791056,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;f__Ruminococcaceae_unclassified;f__Ruminococcaceae_unclassified;","0.0292699697444718","0.00384866523180398","2.14179674789451e-05","4.25004543071092e-05" +"9","1yr_2w",0.013241473967396,"Otu00022",0.000763703739131532,0.00195168733333614,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea;g__Dorea_unclassified;","0.000142720847379308","0.000117843817294513","0.026558447100312","0.0395132558392752" 
+"10","1yr_2w",0.013143755400476,"Otu00018",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__5-7N15;g__5-7N15_unclassified;","0.0262340097898727","0.00970968730145009","0","0" +"11","1yr_2w",0.012431390357538,"Otu00012",0.0272785116310676,0.0358517581436888,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;g__Bacteroides_unclassified;","0","0","0.0248127940656414","0.0642719067932488" +"12","1yr_2w",0.012249980054458,"Otu00016",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;o__Bacteroidales_unclassified;o__Bacteroidales_unclassified;o__Bacteroidales_unclassified;","0.0244676127767363","0.0148536675150644","0","0" +"13","1yr_2w",0.011256623251136,"Otu00004",0.00374258479486702,0.00748516958973403,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis;","0","0","0.022476432440375","0.0592444537939672" +"14","1yr_2w",0.010129194115769,"Otu00021",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Paraprevotellaceae];g__CF231;g__CF231_unclassified;","0.0202262575161955","0.00976976462053334","7.14122486288848e-06","2.01984341061057e-05" +"15","1yr_8w",0.0376560323802521,"Otu00001",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;","7.10913950975374e-06","2.01076830229923e-05","0.0732926353297153","0.0980374199037812" +"16","1yr_8w",0.0356947491021932,"Otu00005",0.001193745444872,0.00274561452320561,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;f__S24-7_unclassified;f__S24-7_unclassified;","0","0","0.0694523480584903","0.109558610377261" 
+"17","1yr_8w",0.0267585191971277,"Otu00006",0.0110491256127801,0.018824436229181,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;f__Ruminococcaceae_unclassified;f__Ruminococcaceae_unclassified;","0.0576048927649059","0.00987764114455286","0.0150590524259111","0.0337108011066655" +"18","1yr_8w",0.022153382610294,"Otu00004",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis;","0","0","0.0431226023826681","0.0459506953192825" +"19","1yr_8w",0.018614937190912,"Otu00010",0.00370900999097481,0.00748516958973403,"k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__reuteri;","0","0","0.0362069928081241","0.101899345043138" +"20","1yr_8w",0.016007395628362,"Otu00017",0.0272785116310676,0.0358517581436888,"k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Olsenella;g__Olsenella_unclassified;","0","0","0.0311684825139048","0.0821349438860236" +"21","1yr_8w",0.015165896969144,"Otu00008",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;g__Bacteroides_unclassified;","0","0","0.0295108853230356","0.0264988241909802" +"22","1yr_8w",0.015137420479094,"Otu00009",0.31731050786291,0.33173371276577,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__perfringens;","0","0","0.0294428873763084","0.0832770612859979" +"23","1yr_8w",0.013887124239801,"Otu00015",0.144127034816016,0.161703502476505,"k__Bacteria;p__Spirochaetes;c__Spirochaetes;o__Spirochaetales;f__Spirochaetaceae;g__Treponema;g__Treponema_unclassified;","0","0","0.0270100689858371","0.0607214408149042" 
+"24","1yr_8w",0.01348689566677,"Otu00018",0.000451095347614305,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__5-7N15;g__5-7N15_unclassified;","0.0262340097898727","0.00970968730145009","7.09944908275118e-06","2.00802743564079e-05" +"25","1yr_8w",0.012573204203004,"Otu00016",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;o__Bacteroidales_unclassified;o__Bacteroidales_unclassified;o__Bacteroidales_unclassified;","0.0244676127767363","0.0148536675150644","0","0" +"26","1yr_8w",0.012514113254386,"Otu00014",0.00112181548065039,0.00271597432157464,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;f__Ruminococcaceae_unclassified;f__Ruminococcaceae_unclassified;","0.0292699697444718","0.00384866523180398","0.00499088359802437","0.00885951016441524" +"27","1yr_8w",0.011375010175498,"Otu00029",0.0106563747212914,0.018824436229181,"k__Bacteria;p__Spirochaetes;c__Spirochaetes;o__Spirochaetales;f__Spirochaetaceae;g__Treponema;g__Treponema_unclassified;","0","0","0.0220500918275586","0.059427091861998" +"28","1yr_8w",0.010601901246815,"Otu00019",0.000547876224134609,0.00151324009791056,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Phascolarctobacterium;g__Phascolarctobacterium_unclassified;","7.10913950975374e-06","2.01076830229923e-05","0.0206080699409765","0.022046054342873" +"29","1yr_8w",0.010400132475867,"Otu00021",0.000331065830938524,0.0013833590660172,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Paraprevotellaceae];g__CF231;g__CF231_unclassified;","0.0202262575161955","0.00976976462053334","0","0" +"30","2w_8w",0.110139034006507,"Otu00002",0.0208625823327655,0.0299899621033504,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;","0.196185324113124","0.237964225950146","0.00720522140326036","0.0160106698483778" 
+"31","2w_8w",0.070274296564485,"Otu00001",0.462249946125543,0.462249946125543,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;","0.128370196584475","0.163518290828978","0.0732926353297153","0.0980374199037812" +"32","2w_8w",0.066265261269552,"Otu00003",0.0356919001168046,0.0443737136587301,"k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;","0.119333402823358","0.180003455829014","0.0105988182042376","0.0212652187725935" +"33","2w_8w",0.048569279133863,"Otu00007",0.00624617080518557,0.0119718273766057,"k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;g__Bifidobacterium_unclassified;","0.0862865503350716","0.160256975806244","0.00150976320771876","0.0026719191915837" +"34","2w_8w",0.039937569065732,"Otu00009",0.16014820803952,0.175400418328998,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__perfringens;","0.0484260502484597","0.136300018759695","0.0294428873763084","0.0832770612859979" +"35","2w_8w",0.039334409088334,"Otu00005",0.00203624953941813,0.00446035613396352,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;f__S24-7_unclassified;f__S24-7_unclassified;","1.4282449725777e-05","4.03968682122114e-05","0.0694523480584903","0.109558610377261" +"36","2w_8w",0.0355700872344279,"Otu00011",0.0208625823327655,0.0299899621033504,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Butyricicoccus;s__pullicaecorum;","0.0648314243899582","0.07929234445691","0.00839931950414475","0.0170670893124403" +"37","2w_8w",0.029688138874924,"Otu00004",0.0456798096400443,0.0552966116695273,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis;","0.022476432440375","0.0592444537939672","0.0431226023826681","0.0459506953192825" 
+"38","2w_8w",0.024416435440736,"Otu00010",0.0205814484171982,0.0299899621033504,"k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__reuteri;","0.00956528352904612","0.0107021345541737","0.0362069928081241","0.101899345043138" +"39","2w_8w",0.017672739089483,"Otu00017",0.334333694910544,0.341763332575223,"k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Olsenella;g__Olsenella_unclassified;","0.000128817232610937","0.000246282300214037","0.0311684825139048","0.0821349438860236" +"40","2w_8w",0.016919957006409,"Otu00008",0.0742034113297552,0.0875219723376599,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;g__Bacteroides_unclassified;","0.0187242405454188","0.0316691359705659","0.0295108853230356","0.0264988241909802" +"41","2w_8w",0.01670700772156,"Otu00012",0.165806560194013,0.177374459742433,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;g__Bacteroides_unclassified;","0.0248127940656414","0.0642719067932488","0.00876863933332196","0.00930476384159196" +"42","2w_8w",0.015305507942289,"Otu00015",0.144127034816016,0.161703502476505,"k__Bacteria;p__Spirochaetes;c__Spirochaetes;o__Spirochaetales;f__Spirochaetaceae;g__Treponema;g__Treponema_unclassified;","0","0","0.0270100689858371","0.0607214408149042" +"43","2w_8w",0.0146779937502001,"Otu00022",0.0142140388690821,0.0225464064819922,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea;g__Dorea_unclassified;","0.026558447100312","0.0395132558392752","0.00387019362634656","0.00797794814413105" +"44","2w_8w",0.0125368272355489,"Otu00029",0.0106563747212914,0.018824436229181,"k__Bacteria;p__Spirochaetes;c__Spirochaetes;o__Spirochaetales;f__Spirochaetaceae;g__Treponema;g__Treponema_unclassified;","0","0","0.0220500918275586","0.059427091861998" 
+"45","2w_8w",0.012417999784185,"Otu00013",0.0323916374172107,0.041389314477547,"k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;g__Bacteroides_unclassified;","0.0132660141740724","0.0374988902221327","0.0115530902609724","0.0131682799783904" +"46","2w_8w",0.011990487616809,"Otu00019",0.012839119884336,0.0210928398099805,"k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Phascolarctobacterium;g__Phascolarctobacterium_unclassified;","0.00524474145204471","0.0146039100815662","0.0206080699409765","0.022046054342873" diff --git a/Microbiota_analysis_R/Data/.DS_Store b/Microbiota_analysis_R/Data/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/Microbiota_analysis_R/Data/.DS_Store differ diff --git a/Microbiota_analysis_R/Fig1.pdf b/Microbiota_analysis_R/Fig1.pdf new file mode 100644 index 0000000..021c426 Binary files /dev/null and b/Microbiota_analysis_R/Fig1.pdf differ diff --git a/Microbiota_analysis_R/Fig1.png b/Microbiota_analysis_R/Fig1.png new file mode 100644 index 0000000..496619e Binary files /dev/null and b/Microbiota_analysis_R/Fig1.png differ diff --git a/Microbiota_analysis_R/Fig1.ps b/Microbiota_analysis_R/Fig1.ps new file mode 100644 index 0000000..fe02263 Binary files /dev/null and b/Microbiota_analysis_R/Fig1.ps differ diff --git a/Microbiota_analysis_R/Rpubs/Microbiota Analysis in R.pdf b/Microbiota_analysis_R/Microbiota Analysis in R.pdf similarity index 100% rename from Microbiota_analysis_R/Rpubs/Microbiota Analysis in R.pdf rename to Microbiota_analysis_R/Microbiota Analysis in R.pdf diff --git a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_BRC.Rproj b/Microbiota_analysis_R/Microbiota_Analysis_BRC.Rproj similarity index 94% rename from Microbiota_analysis_R/Rpubs/Microbiota_Analysis_BRC.Rproj rename to Microbiota_analysis_R/Microbiota_Analysis_BRC.Rproj index 3af27f6..8e3c2eb 100755 --- 
a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_BRC.Rproj +++ b/Microbiota_analysis_R/Microbiota_Analysis_BRC.Rproj @@ -1,13 +1,13 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.Rmd b/Microbiota_analysis_R/Microbiota_Analysis_in_R.Rmd similarity index 96% rename from Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.Rmd rename to Microbiota_analysis_R/Microbiota_Analysis_in_R.Rmd index 8e77d65..9a0bb94 100755 --- a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.Rmd +++ b/Microbiota_analysis_R/Microbiota_Analysis_in_R.Rmd @@ -1,1363 +1,1353 @@ ---- -title: "Microbiota Analysis in R" -author: "Kim Dill-McFarland" -date: "March 20, 2017" -output: - html_document: - toc: true - toc_float: true - toc_depth: 4 - pdf_document: default ---- - -Updated April 15, 2017 - -Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP - -#Tips for this workshop -1. If you have any issues in R, type ??command into the console where "command" is the function you are having issues with and a help page will come up. -2. Lines starting with `#` are comments that are for the reader's benefit. These lines are not code and do not need to be entered into the console. -3. GREY boxes contain code that you can copy and paste to run on your machine. -```{r} -#GREY box -``` -4. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console. -```{} -WHITE box -``` -5. Basic R code you may find useful: - a. 
Matrices/data frames are designated by [ , ] where it is [rows, columns] - b. | is or - c. & is and - -#Introduction -Written for R v3.3.2 in RStudio v1.0.136 - -##Goal -The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs). - -This tutorial assumes some basic statistical knowledge. Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don't become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data! - -##Data -The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in [Dill-McFarland *et al*. Sci Rep 7: 40864](https://www.ncbi.nlm.nih.gov/pubmed/28098248). Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA). - -##Files -We will use the following files created using the [Microbiota Processing in mothur: Standard Operating Procedure (SOP)](https://rpubs.com/dillmcfarlan/mothurSOP). - -* example.final.nn.unique_list.0.03.norm.shared (OTU table) -* example.final.nn.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs) - -We will also be using tab-delimited metadata and SCFA files created in Excel. 
The metadata includes our metadata (like age and ADGKG) as well as alpha-diversity metrics from `example.final.nn.unique_list.0.03.norm.groups.summary` calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals. - -* example.metadata.txt -* example.SCFA.txt - -Finally, we will be loading a number of custom scripts from `Steinberger_scripts` and a pre-calculated OTU tree `NJ.tree.RData`. The information for creating this tree is provided in this tutorial. - -#Get set up -##Download and install -* Base R: http://cran.mtu.edu/ -* RStudio: https://www.rstudio.com/products/rstudio/download3/ -* Packages: Open RStudio on your computer. If you have not already downloaded these packages, go to the lower right quadrant of your screen and open the Package tab. Click "download" and search for the package you want to download. - + `ape` - + `dplyr` - + `ggplot2` - + `gplots` - + `lme4` - + `phangorn` - + `plotly` - + `tidyr` - + `vegan` - + `VennDiagram` - + `venneuler` - + `phyloseq` (`phyloseq` is not on CRAN, so we have to call it manually. See below.) - -Copy and paste the following into your console. -```{r} -source("https://bioconductor.org/biocLite.R") -biocLite("phyloseq") -``` - -**Note**: If you are having trouble installing packages, turn off your computer's firewall temporarily. - -##Organization -All of our analyses will be organized into a "Project". - -Make a new project by selecting File->New project. Select "New Directory" and "Empty Project". Name the project "Microbiota_Analysis_BRC" and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop - -Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder. - -Now your screen should look like this - -* Upper left: Where you type and save the code you want to run.
-* Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane. -* Lower left: The console. Where commands and outputs run (similar to the mothur window). -* Lower right: Variable. Explore the different tabs. - -#Data manipulation -##Load Packages -The "library" command tells R to open the package you want to use. You need to do this every time you open R. - -```{r Load packages} -#Analyses of Phylogenetics and Evolution package. Required for tree calculations to be used with phyloseq -library(ape) - -#This package will also help us more easily manipulate our data -library(dplyr) - -#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package. -library(ggplot2) - -#This package is used to calculate and plot Venn diagrams as well as heatmaps -library(gplots) - -#Linear mixed-effects models like repeated measures analysis -library(lme4) - -#used to read in mothur-formatted files -library(phangorn) - -#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses. -library(phyloseq) - -#A package to create interactive web graphics of use in 3D plots -library(plotly) - -#This package will help us more easily manipulate our data, which are matrices -library(tidyr) - -#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses. -library(vegan) - -#Pretty Venn diagrams -library(VennDiagram) -library(venneuler) -``` - -##Load Data -In the code, the text before = is what the file will be called in R.
Make this short but unique as this is how you will tell R to use this file in later commands. - -* header: tells R that the first row is column names, not data -* row.names: tells R that the first column is row names, not data -* sep: tells R that the data are tab-delimited. If you had a comma-delimited file, you would use `sep=","` - -```{r Load data} -#OTU table (shared file) -OTU = read.table("example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t") - -#Taxonomy of each OTU -tax = read.table("example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t") - -#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names) -meta = read.table("example.metadata.txt", header=TRUE, row.names=1, sep="\t") - -#SCFA data -SCFA = read.table("example.SCFA.txt", header=TRUE, row.names=1, sep="\t") -``` - -##Clean up the data -You can look at your data by clicking on it in the upper-right quadrant "Environment" - -There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them. - -###OTU table -We need to use the "Group" column as the row names so that it will match our metadata -```{r} -row.names(OTU) = OTU$Group -``` - -We then need to remove the "label", "numOtus", and "Group" columns as they are not OTU counts like the rest of the table -```{r} -OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))] -``` - -###Taxonomy table -For the taxonomy table, we name the rows by the OTU # -```{r} -row.names(tax) = tax$OTU -``` - -Remove all the OTUs that don't occur in our OTU.clean data set -```{r} -tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),] -``` - -We then need to separate the "Taxonomy" column so that each level (*i.e.* Domain, Phylum, etc) is in its own column.
We do this with a special command "separate" from the `tidyr` package -```{r} -tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";") -``` - -Finally, we remove the "Size" and "Strain" columns as well as "OTU", since the OTU names are now the row names -```{r} -tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))] -``` - -###Metadata and SCFA tables -These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis. - -##Order the data -To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these. -```{r Order the data} -OTU.clean = OTU.clean[order(row.names(OTU.clean)),] -meta = meta[order(row.names(meta)),] -SCFA = SCFA[order(row.names(SCFA)),] -``` - -Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it. - -##Set seed -We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed. - -```{r} -set.seed(8765) -``` - -#Alpha-diversity -Alpha-diversity is within-sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric. - -![](Diversity_richness.png) -This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2. - - -##Explore alpha metrics -Now we will start to look at our data. We will first start with alpha-diversity and richness. Let's plot some common ones here.
-```{r} -#Create 2x2 plot environment so that we can see all 4 metrics at once. -par(mfrow = c(2, 2)) - -#Then plot each metric. -hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10) -hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10) -hist(meta$chao, main="Chao richness", xlab="", breaks=15) -hist(meta$ace, main="ACE richness", xlab="", breaks=15) -``` - -You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis. - -Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I've specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon's diversity as well as most richness metrics. Simpson's diversity, on the other hand, is usually skewed as seen here. - -So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity. - -Let's look at inverse Simpson instead. -```{r} -#Create 2x2 plot environment -par(mfrow = c(2, 2)) - -#Plots -hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10) -hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10) -hist(meta$chao, main="Chao richness", xlab="", breaks=15) -hist(meta$ace, main="ACE richness", xlab="", breaks=15) -``` - -Now we see a bimodal distribution for Simpson similar to the richness metrics. - -To test for normalcy statistically, we can run the Shapiro-Wilk test of normality. -```{r} -shapiro.test(meta$shannon) -shapiro.test(1/meta$simpson) -shapiro.test(meta$chao) -shapiro.test(meta$ace) -``` - -We see that, as expected from the graphs, none are normal. - -However, our sample size is small and normalcy tests are very sensitive for small data-sets. 
In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!) - -So, what does this mean for our purposes? Well, we should run statistical tests that don't assume our data is normal, because we don't have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well. - -Overall, for alpha-diversity: - -* ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal -* Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal - -Our main variables of interest are - -* AgeGroup: 2w, 8w, 1yr -* ADGKG: 0.05-1.56 kg gained per day (average daily gain kg) - -##Categorical variables -Now that we know which tests can be used, let's run them. - -**Normally distributed metrics** - -Since it's the closest to normalcy, we will use **Shannon's diversity** as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test - -Does age impact the Shannon diversity of the fecal microbiota? -```{r} -#Run the ANOVA and save it as an object -aov.shannon.age = aov(shannon ~ AgeGroup, data=meta) -#Call for the summary of that ANOVA, which will include P-values -summary(aov.shannon.age) -``` - -To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey's honest significance test of our ANOVA. -```{r} -TukeyHSD(aov.shannon.age) -``` - -We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age. 
-```{r} -#Re-order the groups because the default is 1yr-2w-8w -meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr")) -#Return the plot area to 1x1 -par(mfrow = c(1, 1)) -#Plot -boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity") -``` - -**Non-normally distributed metrics** - -We will use **Chao's richness estimate** here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test) -```{r} -kruskal.test(chao ~ AgeGroup, data=meta) -``` - -We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests -```{r} -pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr") -``` - -Like diversity, we see that richness also increases with age. -```{r} -#Create 1x1 plot environment -par(mfrow = c(1, 1)) -#Plot -boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness") -``` - -##Continuous variables -For continuous variables, we use general linear models, specifying the distribution that best fits our data. - -**Normally distributed metrics** - -Since ADG is a continuous variable, we run a general linear model. We will again use Shannon's diversity as our roughly normal metric. The default of `glm` and `lm` is the normal distribution so we don't have to specify anything. - -Does ADG impact the Shannon diversity of the fecal microbiota? -```{r} -glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta) -summary(glm.shannon.ADG) -``` - -The output lets us know that the intercept of our model is significantly different from 0 but our slope (*i.e.* our variable of interest) is not. This makes sense when we look at the data.
-```{r} -plot(shannon ~ ADGKG, data=meta) -#Add the glm best fit line -abline(glm.shannon.ADG) -``` - -**Non-normally distributed metrics** - -We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better. - -But which distribution should we choose? In statistics, there is no one "best" model. There are only good and better models. We will use the plot() function to compare two models and pick the better one. - -First, the Gaussian (normal) distribution, which we already know is a bad fit. -```{r} -gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian") -par(mfrow = c(1,2)) -plot(gaussian.chao.ADG, which=c(1,2)) -``` - -Quasipoisson (log) distribution -```{r} -qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson") -par(mfrow = c(1,2)) -plot(qp.chao.ADG, which=c(1,2)) -``` - -What we're looking for is no pattern in the Residuals vs. Fitted graph ("stars in the sky"), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot. - -While it's still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness. -```{r} -summary(qp.chao.ADG) -``` - -Plotting this we see that, indeed, there is no significant correlation between Chao and ADG. -```{r} -#Return the plot area to 1x1 -par(mfrow = c(1, 1)) -#Plot -plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)") -abline(qp.chao.ADG) -``` - -##Mixed models -Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG.
In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr). - -Think about your variables and what they mean "in the real world." Logically combine them into as few ANOVA tests as possible. In the end, it's better to test a meaningless interaction than not test a meaningful one. - -We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The `*` symbol is a shortcut for models. A*B is equivalent to A + B + A:B -```{r} -aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta) -summary(aov.shannon.all) -``` - -We can see that the interaction of age and ADG doesn't significantly impact Shannon diversity, So we should remove that variable to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms. -```{r} -aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta) -summary(aov.shannon.all2) -``` - -Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only. -```{r} -TukeyHSD(aov.shannon.all) -``` - -However, you will see that we don't get any data from ADG since it is continuous. There is an error denoting this as "non-factors ignored: ADGKG" - -So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output. -```{r} -glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta) -summary(glm.shannon.all) -``` - -Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. 
Let's go through each line - -* (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn't expect it to be for any alpha-diversity metric since 0 means nothing is there - -* AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing "shannon ~ AgeGroup" and only looking at the 2w-1yr pairwise comparison) -* AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison - -* ADGKG - the slope of Shannon to ADGKG (the same as testing "shannon ~ ADGKG") - -* AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr -* AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr - -As we saw in ANOVA, none of the interaction terms are significant so we remove them. -```{r} -glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta) -summary(glm.shannon.all2) -``` - -**Note**: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables. - -We can run a similar test with non-normal data like Chao. -```{r} -qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson") -summary(qp.chao.all) -``` - -Remove the non-significant interaction. -```{r} -qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson") -summary(qp.chao.all2) -``` - -##Repeated measure -Another thing to consider with this data is the fact that we sampled the same animals over time. So, we have a repeated measures design. There are a number of ways to do repeated measures in R. I personally like the `lme4` package used here. - -We add the repeated measure component by adding a random effect for the individual animals with `(1|Animal)` in the `lmer` function.
-```{r} -rm.shannon.all = lmer(shannon ~ AgeGroup+ADGKG + (1|Animal), data=meta) -summary(rm.shannon.all) -``` - -We see that very little of the variance in the data is explained by the animal random effects (0.03793). So we actually don't need to include repeated measures in our final model, but it was necessary to check! - - -**From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.** - - -#Beta-diversity -Beta-diversity is between-sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (*i.e.* diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (*i.e.* richness: Jaccard, unweighted UniFrac). - -Beta-diversity appears like the following (completely made-up numbers) - - . | sample1 | sample2 | sample3 | ... -------- | ------- | ------- | ------- | --- -sample1 | 0 | 0.345 | 0.194 | ... -sample2 | 0.345 | 0 | 0.987 | ... -sample3 | 0.194 | 0.987 | 0 | ... - ... | ... | ... | ... | ... - -##Visualization -The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal component or principal coordinate analysis (PCA/PCoA) if you've heard of those, only nMDS is more statistically robust with multiple iterations in the form of the `trymax` part of the command. - -Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar. - -###OTU-based metrics -There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account.
The distance between *Prevotella* OTU1 and *Prevotella* OTU2 is equivalent to the distance between *Prevotella* OTU1 and *Bacteroides* OTU1. - -####Dot plots -First, we calculate the nMDS values for a 2-axis `k=2` graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (*i.e.* diversity). This uses the `metaMDS` function from the package `vegan`. -```{r} -BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000) -``` - -We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2-axis are sufficient to view the data. - -Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages -```{r} -par(mfrow = c(1, 1)) -#Create a blank plot for the nmds -plot(BC.nmds, type="n", main="Bray-Curtis") -#Add the points colored by age -points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) -#Add a legend -legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -``` - -This will create a plot in the lower right quadrant. If you want to get fancy, type "?plot" in the console to see other ways to modify the plot function. - -A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (*i.e.* richness). -```{r} -J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000) - -plot(J.nmds, type="n", main="Jaccard") -points(J.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-3, 1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -``` - -You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. 
This is because Jaccard is a transformation of Bray-Curtis with J = 2BC/(1+BC) - -####Ellipses -You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using `ordiellipse` from `vegan`. - -Code courtesy of Madison Cox. -```{r} -plot(BC.nmds, type="n", main="Bray-Curtis") -legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) - -#Add an ellipse for 2w -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) - -#Add an ellipse for 8w -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) - -#Add an ellipse for 1yr -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) -``` - -We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota. - -####3D plots -If your stress is high (like over 0.3) for your `metaMDS` calculation, you probably need to increase to 3 axes `k=3`. Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the `plotly` package to visualize a 3D Bray-Curtis plot. 
- -```{r} -#Calculate the Bray-Curtis nMDS for 3-axis -BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000) -``` - -Extract x-y-z values for this nmds -```{r} -BCxyz = scores(BC.nmds.3D, display="sites") -#This is a table that looks like -BCxyz -``` - -Plot the xyz coordinates and color by age -```{r} -plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) -``` - -**Note**: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so. -```{r} -par(mfrow=c(1,2)) -#Axis 1 and 2 (x and y) -plot(BCxyz[,1], BCxyz[,2], main="Bray-Curtis 1:2", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-5.4, 3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Axis 1 and 3 (x and z) -plot(BCxyz[,1], BCxyz[,3], main="Bray-Curtis 1:3", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) -``` - -###Phylogenetic-based metrics -The most common of this type of beta-diversity metric is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant. - -Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data. - -Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands. - -####Create physeq object -To start, you must make a `phyloseq` object which includes the OTU.clean, meta, and tax.clean data.
We tell R which tables are each type -```{r nMDS_unifrac} -OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE) -tax.UF = tax_table(as.matrix(tax.clean)) -meta.UF = sample_data(meta) -``` - -We then merge these into an object of class phyloseq. -```{r} -physeq = phyloseq(OTU.UF, tax.UF, meta.UF) -``` - -To add the phylogenetic component to UniFrac, we calculate a rooted phylogenetic tree of our OTUs. This takes a long time so we have provided the tree for you. - -However, if we were to calculate a tree, first, we import a distance matrix created from representative sequences of our OTUs. We would use `phyloseq` to read the file as it was created in mothur as seen under "Trees of OTUs" [here](https://rpubs.com/dillmcfarlan/mothurSOP). - -**DO NOT RUN THIS** -```{} -dist.mat = import_mothur_dist("clean_repFasta.phylip.dist") -``` - -We would then calculate a rooted neighbor-joining tree from the distance matrix using the `ape` package. - -**DO NOT RUN THIS** -```{} -NJ.tree = bionj(dist.mat) -``` - -Instead, we have pre-calculated this tree and you can load it with -```{r} -load("NJ.tree.Rdata") -``` - -Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations. -```{r} -physeq.tree = merge_phyloseq(physeq, NJ.tree) -``` - -We can look at this object and see its components. -```{r} -physeq.tree -``` - -####Dot plots -Calculate weighted UniFrac (*i.e.* diversity) distances and ordinate into an nMDS. We specify weighted with `weighted=TRUE`. -```{r} -wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE) -``` - -You can plot UniFrac nMDS using the basic `plot` function as we've done before. -```{r} -par(mfrow=c(1,1)) -plot(wUF.ordu, type="n", main="Weighted UniFrac") -points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(0.3,0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -``` - -But let's also look at the `ggplot2` package.
This package is incredibly powerful and can be customized in many ways. [This document](https://www.rstudio.com/wp-content/uploads/2016/11/ggplot2-cheatsheet-2.1.pdf) has many helpful tips. -```{r} -plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + - scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + - theme_bw() + - ggtitle("Weighted UniFrac") -``` - -Unweighted UniFrac (*i.e.* richness) can be visualized in the same way. We specify unweighted with `weighted=FALSE`. -```{r} -uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE) - -plot_ordination(physeq.tree, uwUF.ordu, type="sites", color="AgeGroup") + - scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + - theme_bw() + - ggtitle("Unweighted UniFrac") -``` - -####Ellipses -Ellipses can be plotted instead of points as well. With the basic plot function: -```{r} -plot(wUF.ordu, type="n", main="Weighted UniFrac") -legend(0.3, 0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) - -#Add an ellipse for 2w -ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) - -#Add an ellipse for 8w -ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) - -#Add an ellipse for 1yr -ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) -``` - -We can also plot ellipses in `ggplot2`. However, these ellipses are not exactly the same as the standard error ellipses used with OTU-based metrics as they use different underlying calculations. However, they get at the same question of confidence intervals for groups of points on an nMDS.
- -We plot ellipses with `ggplot2` by adding the `stat_ellipse` function to our plot. -```{r} -plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + - scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + - theme_bw() + - stat_ellipse() + - ggtitle("Weighted UniFrac") -``` - -####3D plots -3D UniFrac ordinations are not currently supported by `phyloseq`. We see that our ordinations only include 2 dimensions. -```{r} -wUF.ordu -``` - -But we can instead calculate UniFrac distances using `UniFrac` and ordinating for 3-axes with `metaMDS`. - -```{r} -wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE) -wUF.nmds.3D = metaMDS(wUF.dist, method="NMDS", k=3) -``` - -Then, similar to what we did with Bray-Curtis/Jaccard, we pull out the xyz values and plot with `plotly`. -```{r} -wUFxyz = scores(wUF.nmds.3D, display="sites") -#This is a table that looks like -wUFxyz - -plot_ly(x=wUFxyz[,1], y=wUFxyz[,2], z=wUFxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) -``` - -###Vectors for continuous variables -While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots. - -To do this, we first fit the variables to our distances using the `envfit` function in `vegan`. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac. -```{r nMDS_vectors} -fit.BC = envfit(BC.nmds, meta) -fit.BC -``` -We see that it has automatically fit every variable in our meta table. - -The simplest way around this is to just ask envfit to run on only the variables you want. 
-```{r} -fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")]) -fit.BC -``` - -We repeat for weighted UniFrac -```{r} -fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")]) -fit.wUF -``` -For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group's name. For continuous variables, it adds an arrow in the direction from smallest to largest value. - -**Note**: The P-values for variables in `envfit` are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, `envfit` P-values tell you how well the arrow or centroids fit the *x-y data of the nMDS*, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was *not* significant, the fitted arrow/centroids will also *not* be significant. We see this type of result here, but this will not always be the case. - -However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it's difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in `envfit` that you know are significant in other tests may still be represented on an nMDS as a visual aid. - -Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won't use this plot in a publication, but it is good practice.
- -For Bray-Curtis: -```{r} -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Add fitted variables -plot(fit.BC, col="black") -``` - -You could also ask it to only plot variables with a fit P-value < 0.05. So we would only see the centroids -```{r} -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Add fitted variables -plot(fit.BC, col="black", p.max=0.05) -``` - -Weighted UniFrac -```{r} -plot(wUF.ordu, type="n", main="Weighted UniFrac") -points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(.3,.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Add fitted variables -plot(fit.wUF, col="black") -``` - -You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group. - -Fitting all OTUs would take awhile so we will only fit the first 10 in our table. -```{r} -fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10]) -fit.BC.OTU - -#We will only plot significant arrows in this case -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Add fitted variables -plot(fit.BC.OTU, col="black", p.max=0.05) -``` - -You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs. 
-```{r} -#Extract all OTUs within the genus Ruminococcus -OTU.Rumino = OTU.clean[,tax.clean$Genus == "g__Ruminococcus"] -#Sum the abundances of the Ruminococcaceae OTUs into one variable (column) -OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino) - -#Fit the new Ruminococcaceae group -fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum) -fit.BC.Rumino - -#Plot -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) -legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -#Add fitted variables -plot(fit.BC.Rumino, col="black", labels=c("Ruminococcus")) -``` - -##Statistically test beta-diversity -While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest. - -You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are signficant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. Like group 1 has a lot of *Prevotella* OTU1 and group 2 has a lot of *Prevotella* OTU2, but they are both *Prevotella* so UniFrac treats them as being very similar. - -###PERMANOVA -For Bray-Curtis or Jaccard, we use the `vegan` package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model. - -**Note**: adonis cannot handle or account for NA or blanks in your data. 
Subset to only samples with complete metadata before running `vegdist` if these exist. -```{r} -#Calculate distance and save as a matrix -BC.dist=vegdist(OTU.clean, distance="bray") -#Run PERMANOVA on distances. -adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000) -``` - -Similarly for Jaccard -```{r} -J.dist=vegdist(OTU.clean, distance="jaccard") -adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000) -``` -We see that the interaction is not significant so we remove it. -```{r} -adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000) -adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000) -``` - - -For UniFrac, we use the `phyloseq` package to calculate distances and then `vegan` to run PERMANOVA. -```{r} -wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE) -adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000) - -uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE) -adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000) -``` -Remove non-significant interaction term -```{r} -adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000) -adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000) -``` - -###ANOSIM -If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if you group sizes are *very, very* different, like 10 vs 100. 
- -For example, Bray-Curtis: -```{r} -anosim(BC.dist, meta$AgeGroup, permutations = 1000) -``` - -**Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.** - -###2D variables -These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi. - -`mantel` from `vegan` tests if two distance matrices co-vary *e.g.* does the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter. - -You can only compare samples where you have both types of data so we must subset our OTU table to only the samples that we also have SCFA for. The names are a little different between the tables so we also add ".F" to the SCFA names to make them match -```{r} -OTU.SCFA = OTU.clean[row.names(OTU.clean) %in% paste(row.names(SCFA), ".F", sep=""),] -``` - -We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it. -```{r} -dist1 = vegdist(OTU.SCFA) -dist2 = vegdist(SCFA) -``` - -Run a Mantel test comparing the 2 matrices. -```{r} -mantel(dist1, dist2, permutations=100) -``` - -We see that the overall OTU table and SCFA tables do not co-vary.
- -You can also run Mantel on 3 matrices at once like so - -**Do not run as we do not have 3 matrices here** -```{} -mantel.partial(dist1, dist2, dist3, permutations=100) -``` - -##Beta dispersion -Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances). - -Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM - -Calculate dispersion (variances) within each group. -```{r} -disp.age = betadisper(BC.dist, meta$AgeGroup) -``` - -Perform an ANOVA-like test to determine if the variances differ by groups. -```{r} -permutest(disp.age, pairwise=TRUE, permutations=1000) -``` - -Combining this with our plot, -```{r} -plot(BC.nmds, type="n", main="Bray-Curtis") -legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) -ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) -``` - -we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers. - - -#OTUs that differ by -##Categorical variables -Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don't differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. 
Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don't want to look at over 5000 P-values, do you? - -There are a number of ways to decrease the number of OTUs you're looking at - -1. Don't use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest -2. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample -3. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples -4. Combine 2 and 3 - -However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low abundant OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren't useful? - -So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. **Note**: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables. - -SIMPER's output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are **cumulative**, so to get each OTU's contribution, you must subtract the previous OTU's value. - -For example -```{r ID_OTUs_differ} -simper(OTU.clean, meta$AgeGroup, permutations=100) -``` - -We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups.
*They are not necessarily significantly different.* - -To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age. -```{r} -kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup) -``` - -In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group -```{r} -kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup) -``` -**Note**: These P-values have not been corrected from false discovery rate (fdr) yet. - -Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his [GitHub page](https://github.com/asteinberger9/seq_scripts) and we have provided them for this workshop in `/Steinberger_scripts` - -**Disclaimer** *Andrew has provided these scripts out of the goodness of his heart and provides no guarentee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.* - -The use of these scripts are as follows (from Steinberger GitHub with some modifications) - -**simper_pretty.R** - -This script is meant to rapidly perform the SIMPER function from the R package `vegan` for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. 
Requires installation of the R package 'vegan'. - -Usage: - -simper.pretty(x, metrics, c('interesting'), perc_cutoff=0.5, low_cutoff = 'y', low_val=0.01, 'output_name') - -Inputs: - -* x: OTU table -* metrics: metadata table -* interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c('int1','int2','int3') -* perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output. -* low_cutoff: 'y' if want to REMOVE OTUs that contribute less than 1% -* low_val: set value of low cutoff (0.01), ignored if low_cutoff='n'. -* output_name: the name that is appended to the output filename "_clean_simper.csv". - - -**R_krusk.R** - -This script takes the output .csv of `simper_pretty.R`, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages 'vegan' and 'dplyr'. - -Usage: - -kruskal.pretty(x, metrics, csv, c('interesting'), 'output_name', taxonomy) - -Inputs: - -* x: OTU table -* metrics: metadata table -* csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv("PATH to name_clean_simper.csv")) -* interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c('int1','int2','int3') -* output_name= the name that is appended to the output filename "_krusk_simper.csv". -* taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional) - -First, we load these functions into R. 
-```{r} -source("Steinberger_scripts/simper_pretty.r") -source("Steinberger_scripts/R_krusk.r") -``` - -Then, we apply them to our data. We will ask for all SIMPER OTUs (`perc_cutoff = 1`, meaning up to cumulative 100%) but cutoff any OTUs that individually contribute less than 1% to SIMPER (`low_val=0.01`). You may want to consider different cutoffs for your data. -```{r} -simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age') - -simper.results = data.frame(read.csv("Age_clean_simper.csv")) -kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax) -``` - -If we import the Kruskal-Wallis back into R and select only OTUs there were significantly different after fdr correction (fdr_krusk_p.val)... -```{r} -#Import -KW.results = data.frame(read.csv("Age_krusk_simper.csv")) -#Remove non-significant -KW.results.signif = KW.results[KW.results$fdr_krusk_p.val < 0.05,] -#Order by OTU# -KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),] -head(KW.results.signif) -``` -we see a number of OTU that significantly differ by age group. - -Looking at OTU1 as relative abundance -```{r} -#Calculate abundance -abund = OTU.clean/rowSums(OTU.clean)*100 -#plot -boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1") -``` - -and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves. - -##Continuous variables -For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models `glm` of the OTU abundances with different distributions `family=` similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model. 
- -##Correlations -So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the "strong" category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available. - -Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables? - -Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set. -```{r} -#Remember we calculated abundance before with -#abund = OTU.clean/rowSums(OTU.clean)*100 - -#Subset OTUs to abundance cutoff -OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))] - -cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall") -cor.kendall -``` - -In this case, we don't see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm. - -Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data. -```{r} -#Calculate abundances -abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100 - -#Subset OTUs to abundance cutoff -OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))] - -cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall") -cor.kendall -``` - -If the data table is too large to view in R, you can write it to a table in your project folder. -```{r} -write.table(cor.kendall, file = "cor_kendall.csv", sep = ",") -``` - -We see that some OTUs strongly correlation with a SCFAs. 
For example, Otu00021 and Otu00025 with Formate - -```{r} -par(mfrow = c(1, 2)) -plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21") -plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25") -``` - -Clearly we don't have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit. -```{r} -OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate) -summary(OTU21.Formate) - -OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate) -summary(OTU25.Formate) -``` - -So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occuring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing. - -#Other visualizations -##Bar charts -The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the `phyloseq` / `ggplot2` packages. - -Let's explore some of the bar chart options. 
First, we'll make the classic additive bar chart for phyla in our samples -```{r Bar_charts} -plot_bar(physeq.tree, fill="Phylum") -``` - -We can simplify by grouping our samples by age group -```{r} -plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") -``` - -And removing the lines between OTUs in the bars -```{r} -plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") -``` - -And only showing the top 5 most abundant phyla -```{r} -#Sort the Phyla by abundance and pick the top 5 -top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5] -#Cut down the physeq.tree data to only the top 10 Phyla -top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names)) -#Plot -plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") -``` - -There are many more options within `ggplot2` to alter this figure. [This document](https://www.rstudio.com/wp-content/uploads/2016/11/ggplot2-cheatsheet-2.1.pdf) has many helpful tips. - -Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid -```{r} -plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") -``` - -And you can break it down at any taxonomic level and color by any other level. - - -##Trees -We can also plot phylogenetic trees and label/modify them by our variables of interest. - -Let's look at the genus *Prevotella* in our data. We want to subset down to just this genus or else our plot would be too cluttered to read. - -Subset by genus -```{r Trees} -prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella") -``` - -We can see that this worked by comparing the number of taxa in our subset and our original data -```{r} -physeq.tree -prevotella -``` - -We can plot these OTUs on a tree. 
-```{r} -plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE) -``` - -In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots. - -Let's make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs - -```{r} -plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE) -``` - -Already it's a little difficult to read. You can view a larger page by clicking "Zoom" above the figure. Or export the figure as a PDF and save as a full page size, 9.5x11. - -There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options. - -##Heat maps -There are some good options in both `phyloseq` and `gplots` to make heatmaps. We will go through `phyloseq` but know that the same things could be done in `gplots` with code specific to that package. - -###OTUs -We're going to just look at the 20 most abundant OTUs to make it more readable. 
-```{r Heat_maps} -#Sort the OTUs by abundance and pick the top 20 -top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20]) -#Cut down the physeq.tree data to only the top 10 Phyla -top20OTU = prune_taxa(top20OTU.names, physeq.tree) -``` - -We now see that we only have 20 taxa -```{r} -top20OTU -``` - -First, you can make a heatmap of OTU abundance across all samples -```{r} -plot_heatmap(top20OTU) -``` - -And grouped by our age groups -```{r} -plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup") -``` - -We can label the OTU taxa -```{r} -plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus") -``` - -And group OTUs within the same Phyla -```{r} -plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum") -``` - -We can also change the colors (white -> purple), including the 0s/NAs (grey). -```{r} -plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="grey") -``` - -You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We'll show Bray-Curtis as an example. Other options are - -* bray -* jaccard -* wunifrac -* uwunifrac - -```{r} -plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis") -``` - -###Beta-diversity -The other common use for heatmaps is to show distances between samples (*i.e.* beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS. - -We do not want to use the plot_heatmap() function from `phyloseq` because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a `gplots` command. 
This command will automatically group samples by similarity (trees) -```{r} -#Bray-Curtis -heatmap.2(as.matrix(BC.dist)) - -#UniFrac -heatmap.2(as.matrix(wUF.dist)) -``` - -You could also change the colors -```{r} -#Rainbow colors -rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9) -heatmap.2(as.matrix(BC.dist), col=rc) -``` - -As always, for further customization, explore with ?heatmap.2 - -##Venn diagrams -Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F - -Create a list of OTUs that occur (count > 0) in each sample. - -* We select for the row by name with *OTU.clean["name",]* -* We select the columns with a value >0 with *OTU.clean[,apply()]* - -```{r Venn_diagrams} -OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))]) - -OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))]) - -OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))]) -``` - -We can then use these lists of OTUs to plot a Venn diagram with venn() from the `gplots` package -```{r} -venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr)) -``` - -We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr -```{r} -OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))]) - -OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))]) - -OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))]) -``` - -And plot -```{r} -venn(list(OTU.2w, OTU.8w, OTU.1yr)) -``` - -These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn. - -Once you have these, you can use the VennDiagram package for more pretty graphing options. 
For example, the age groups venns would be -```{r} -draw.triple.venn(area1 = 385+58+71+320, area2 = 801+190+320+71, area3 = 3177+190+58+71, n12 = 320+71, n23 = 190+71, n13 = 58+71, n123 = 71, category = c("2w", "8w", "1yr"), lty = "blank", fill = c("green", "red", "blue")) -``` - -Or with venneuler, you can scale the circles to be porportional to the total number of OTUs in that group -```{r} -#Create a venneuler object -age.venn=venneuler(c('A' = 385+58+71+320, 'B' = 801+190+320+71, 'C' = 3177+190+58+71, 'A&B' = 320+71, 'B&C' = 190+71, 'A&C' = 58+71, 'A&B&C' = 71)) - -#Add group names -age.venn$labels = c("2w", "8w", "1yr") - -#Plot -plot(age.venn) -``` - -Or we can export the OTU lists and make Venns with this online tool http://bioinformatics.psb.ugent.be/webtools/Venn/. This tool is handy in that is gives you the list of OTUs within the Venn sections so that you can see which specific bacteria are shared. -```{r} -write.table(OTU.2w, "OTU.2w.csv", sep=",", row.names=FALSE, col.names=FALSE) -write.table(OTU.8w, "OTU.8w.csv", sep=",", row.names=FALSE, col.names=FALSE) -write.table(OTU.1yr, "OTU.1yr.csv", sep=",", row.names=FALSE, col.names=FALSE) -``` - -##Networks -###OTUs -You can plot the distances between OTUs as a network. It would be an unreadable mess to plot all the OTUs in our data set, so we will just use the smaller prevotella data set. -```{r} -plot_net(prevotella, color="Species", type="taxa") -``` - -For co-occurrence networks of OTUs, I recommend [Gephi](https://gephi.org/) or [Cytoscape](http://www.cytoscape.org/). Thus far, I have not found an R package comparable to these other programs. - -###Beta-diversity -You can also plot beta-diversity as a network where the edges (lines) are the distances between samples. 
All metrics we've used here are supported (bray, jaccard, wunifrac, uwunifrac) - -```{r} -plot_net(physeq.tree, color="AgeGroup", distance="bray") -``` - -#Publication figures -Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the "Export" function within the Plots window, but this often does not result in high enough resolution. - -Ideally, you want to save in PostScript (.ps) or PDF (.pdf) formats because they are vector-based, meaning they are not any specific dpi and do not get blurry when zoomed in. Other formats (PNG, JPG, BMP, TIFF) are pixel-based formats (little square dots) and can become jagged when zoomed in. - -If you have issues getting a specific font to work, try installing and loading the package `extrafont`. - -##PostScript -Here, we will use `postscript` to export as a `.ps`. This function uses - -* width, height: in inches unless otherwise specified with `units=` -* horizontal: TRUE = landscape, FALSE = portrait -* colormodel: RGB, CMYK, and others -* family: Font to be used within figures - -Then we add `layout` if we have more than one plot within the overall figure. - -* matrix: - + A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4) - + Then the number of rows, columns the figures should be oriented in -* widths: A list of scalars of how large each figure should be in width. -* heights: A list of scalars of how large each figure should be in height. 
- -```{r} -postscript("Fig1.ps", width = 7, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT") - -layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) - -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) - -boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) - -dev.off() -``` - -To open the resulting `.ps` file: - -* Open it directly in Adobe Illustrator (vectors are preserved) -* On a Mac, double-clicking on it will convert it automatically into a PDF and will open automatically into Preview. -* On Windows, it depends on how "file associations" are set-up. Typically the file would need some transformation on a "standard" Windows computer before it can be used. If Adobe software is installed, it could run via Distiller to convert the .ps to a PDF. - -##PDF -To export directly to a PDF, we will use `pdf` - -```{r} -pdf("Fig1.pdf", width = 7, height = 3, colormodel = "rgb", family = "ArialMT") - -layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) - -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) - -boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) - -dev.off() -``` - -##PNG -PNG is pixel-based so it may get blurry if not at high enough resolution. 
The exact resolution can be specified by giving the dpi in `res=` - -```{r} -png("Fig1.png", width = 7, height = 3, units='in', res=300) - -layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) - -plot(BC.nmds, type="n", main="Bray-Curtis") -points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) - -boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) - -dev.off() -``` - +--- +title: "Microbiota Analysis in R" +author: "Kim Dill-McFarland" +date: "March 20, 2017" +output: + html_document: + toc: true + toc_float: true + pdf_document: default +editor_options: + chunk_output_type: console +always_allow_html: yes +--- + +Updated December 19, 2017 + +Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP + +#Tips for this workshop +1. If you have any issues in R, type ??command into the console where "command" is the function you are having issues with and a help page will come up. +2. Lines starting with `#` are comments that are for the reader's benefit. These lines are not code and do not need to be entered into the console. +3. GREY boxes contain code that you can copy and paste to run on your machine. +```{r} +#GREY box +``` +4. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console. +```{} +WHITE box +``` +5. Basic R code you may find useful: + a. Matrices/data frames are designated by [ , ] where it is [rows, columns] + b. | is or + c. & is and + +#Introduction +Written for R v3.3.2 in RStudio v1.0.136 + +##Goal +The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs). + +This tutorial assumes some basic statistical knowledge. 
Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don't become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data! + +##Data +The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in [Dill-McFarland *et al*. Sci Rep 7: 40864](https://www.ncbi.nlm.nih.gov/pubmed/28098248). Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA). + +##Files +We will use the following files created using the [Microbiota Processing in mothur: Standard Operating Procedure (SOP)](https://rpubs.com/dillmcfarlan/mothurSOP). + +* example.final.an.unique_list.0.03.norm.shared (OTU table) +* example.final.an.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs) + +We will also be using tab-delimited metadata and SCFA files created in Excel. The metadata file includes sample information (like age and ADGKG) as well as alpha-diversity metrics from `example.final.an.unique_list.0.03.norm.groups.summary` calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals. + +* example.metadata.txt +* example.SCFA.txt + +Finally, we will be loading a number of custom scripts from `Steinberger_scripts` and a pre-calculated OTU tree `NJ.tree.RData`. The information for creating this tree is provided in this tutorial.
+ +**All data can be downloaded from [GitHub](https://github.com/kdillmcfarland/workshops_UW_Madison)** + +#Get set up +##Download and install +* Base R: http://cran.mtu.edu/ +* RStudio: https://www.rstudio.com/products/rstudio/download3/ +* Packages: Open RStudio on your computer. If you have not already downloaded these packages, go to the lower right quadrant of your screen and open the Package tab. Click "download" and search for the package you want to download. + + `ape` + + `dplyr` + + `ggplot2` + + `gplots` + + `lme4` + + `phangorn` + + `plotly` + + `tidyr` + + `vegan` + + `VennDiagram` + + `phyloseq` (`phyloseq` is not on CRAN, so we have to call it manually. See below.) + +Copy and paste the following into your console. +```{r} +source("https://bioconductor.org/biocLite.R") +biocLite("phyloseq") +``` + +**Note**: If you are having trouble installing packages, turn off your computer's firewall temporarily. + +##Organization +All of our analyses will be organized into a "Project". + +Make a new project by selecting File->New project. Select "New Directory" and "Empty Project". Name the project "Microbiota_Analysis_BRC" and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop + +Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder. + +Now your screen should look like this + +* Upper left: Where you type and save the code you want to run. +* Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane. +* Lower left: The console. Where commands and outputs run (similar to the one mothur window). +* Lower right: Variable. Explore the different tabs. + +#Data manipulation +##Load Packages +The "library" command tells R to open the package you want to use. You need to do this every time you open R. + +```{r Load packages} +#Analyses of Phylogenetics and Evolution package. 
Required for tree calculations to be used with phyloseq +library(ape) + +#This package will also help us more easily manipulate our data +library(dplyr) + +#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package. +library(ggplot2) + +#This package is used to calculate and plot Venn diagrams as well as heatmaps +library(gplots) + +#Linear mixed-effects models like repeated measures analysis +library(lme4) + +#used to read in mothur-formatted files +library(phangorn) + +#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses. +library(phyloseq) + +#A package to create interactive web graphics of use in 3D plots +library(plotly) + +#This package will help us more easily manipulate our data, which are matrices +library(tidyr) + +#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses. +library(vegan) + +#Pretty Venn diagrams +library(VennDiagram) +``` + +##Load Data +In the code, the text before = is what the file will be called in R. Make this short but unique as this is how you will tell R to use this file in later commands. + +* header: tells R that the first row is column names, not data +* row.names: tells R that the first column is row names, not data +* sep: tells R that the data are tab-delimited.
If you had a comma-delimited file, you would use `sep=","` + +```{r Load data} +#OTU table (shared file) +OTU = read.table("Data/example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t") + +#Taxonomy of each OTU +tax = read.table("Data/example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t") + +#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names) +meta = read.table("Data/example.metadata.txt", header=TRUE, row.names=1, sep="\t") + +#SCFA data +SCFA = read.table("Data/example.SCFA.txt", header=TRUE, row.names=1, sep="\t") +``` + +##Clean up the data +You can look at your data by clicking on it in the upper-right quadrant "Environment" + +There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them. + +###OTU table +We need to use the "Group" column as the row names so that it will match our metadata +```{r} +row.names(OTU) = OTU$Group +``` + +We then need to remove the "label", "numOtus", and "Group" columns as they are not OTU counts like the rest of the table +```{r} +OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))] +``` + +###Taxonomy table +For the taxonomy table, we name the rows by the OTU # +```{r} +row.names(tax) = tax$OTU +``` + +Remove all the OTUs that don't occur in our OTU.clean data set +```{r} +tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),] +``` + +We then need to separate the "Taxonomy" column so that each level (*i.e.* Domain, Phylum, etc.) is in its own column.
We do this with a special command "separate" from the `tidyr` package +```{r} +tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";") +``` + +Finally, we remove the "Size" and "Strain" columns, as well as the "OTU" column since the OTU names are now the row names +```{r} +tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))] +``` + +###Metadata and SCFA tables +These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis. + +##Order the data +To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these. +```{r Order the data} +OTU.clean = OTU.clean[order(row.names(OTU.clean)),] +meta = meta[order(row.names(meta)),] +SCFA = SCFA[order(row.names(SCFA)),] +``` + +Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it. + +##Set seed +We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed. + +```{r} +set.seed(8765) +``` + +#Alpha-diversity +Alpha-diversity is within-sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric. + +![](Diversity_richness.png) +This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2. + + +##Explore alpha metrics +Now we will start to look at our data. We will first start with alpha-diversity and richness. Let's plot some common ones here.
+```{r} +#Create 2x2 plot environment so that we can see all 4 metrics at once. +par(mfrow = c(2, 2)) + +#Then plot each metric. +hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10) +hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10) +hist(meta$chao, main="Chao richness", xlab="", breaks=15) +hist(meta$ace, main="ACE richness", xlab="", breaks=15) +``` + +You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis. + +Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I've specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon's diversity as well as most richness metrics. Simpson's diversity, on the other hand, is usually skewed as seen here. + +So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity. + +Let's look at inverse Simpson instead. +```{r} +#Create 2x2 plot environment +par(mfrow = c(2, 2)) + +#Plots +hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10) +hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10) +hist(meta$chao, main="Chao richness", xlab="", breaks=15) +hist(meta$ace, main="ACE richness", xlab="", breaks=15) +``` + +Now we see a bimodal distribution for Simpson similar to the richness metrics. + +To test for normalcy statistically, we can run the Shapiro-Wilk test of normality. +```{r} +shapiro.test(meta$shannon) +shapiro.test(1/meta$simpson) +shapiro.test(meta$chao) +shapiro.test(meta$ace) +``` + +We see that, as expected from the graphs, none are normal. + +However, our sample size is small and normalcy tests are very sensitive for small data-sets. 
In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!) + +So, what does this mean for our purposes? Well, we should run statistical tests that don't assume our data is normal, because we don't have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well. + +Overall, for alpha-diversity: + +* ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal +* Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal + +Our main variables of interest are + +* AgeGroup: 2w, 8w, 1yr +* ADGKG: 0.05-1.56 kg gained per day (average daily gain kg) + +##Categorical variables +Now that we know which tests can be used, let's run them. + +**Normally distributed metrics** + +Since it's the closest to normalcy, we will use **Shannon's diversity** as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test + +Does age impact the Shannon diversity of the fecal microbiota? +```{r} +#Run the ANOVA and save it as an object +aov.shannon.age = aov(shannon ~ AgeGroup, data=meta) +#Call for the summary of that ANOVA, which will include P-values +summary(aov.shannon.age) +``` + +To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey's honest significance test of our ANOVA. +```{r} +TukeyHSD(aov.shannon.age) +``` + +We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age. 
+```{r} +#Re-order the groups because the default is 1yr-2w-8w +meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr")) +#Return the plot area to 1x1 +par(mfrow = c(1, 1)) +#Plot +boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity") +``` + +**Non-normally distributed metrics** + +We will use **Chao's richness estimate** here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test) +```{r} +kruskal.test(chao ~ AgeGroup, data=meta) +``` + +We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests +```{r} +pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr") +``` + +Like diversity, we see that richness also increases with age. +```{r} +#Create 1x1 plot environment +par(mfrow = c(1, 1)) +#Plot +boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness") +``` + +##Continuous variables +For continuous variables, we use general linear models, specifying the distribution that best fits our data. + +**Normally distributed metrics** + +Since ADG is a continuous variable, we run a general linear model. We will again use Shannon's diversity as our roughly normal metric. The default of `glm` and `lm` is the normal distribution so we don't have to specify anything. + +Does ADG impact the Shannon diversity of the fecal microbiota? +```{r} +glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta) +summary(glm.shannon.ADG) +``` + +The output lets us know that the intercept of our model is significantly different from 0 but our slope (*i.e.* our variable of interest) is not. This makes sense when we look at the data. 
+```{r} +plot(shannon ~ ADGKG, data=meta) +#Add the glm best fit line +abline(glm.shannon.ADG) +``` + +**Non-normally distributed metrics** + +We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better. + +But which distribution should we choose? In statistics, there is no one "best" model. There are only good and better models. We will use the plot() function to compare two models and pick the better one. + +First, the Gaussian (normal) distribution, which we already know is a bad fit. +```{r} +gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian") +par(mfrow = c(1,2)) +plot(gaussian.chao.ADG, which=c(1,2)) +``` + +Quasipoisson (log) distribution +```{r} +qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson") +par(mfrow = c(1,2)) +plot(qp.chao.ADG, which=c(1,2)) +``` + +What we're looking for is no pattern in the Residuals vs. Fitted graph ("stars in the sky"), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot. + +While it's still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness. +```{r} +summary(qp.chao.ADG) +``` + +Plotting this we see that, indeed, there is no significant correlation between Chao and ADG. +```{r} +#Return the plot area to 1x1 +par(mfrow = c(1, 1)) +#Plot +plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)") +abline(qp.chao.ADG) +``` + +##Mixed models +Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG. 
In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr). + +Think about your variables and what they mean "in the real world." Logically combine them into as few ANOVA tests as possible. In the end, it's better to test a meaningless interaction than not test a meaningful one. + +We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The `*` symbol is a shortcut for models. A*B is equivalent to A + B + A:B +```{r} +aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta) +summary(aov.shannon.all) +``` + +We can see that the interaction of age and ADG doesn't significantly impact Shannon diversity, So we should remove that variable to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms. +```{r} +aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta) +summary(aov.shannon.all2) +``` + +Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only. +```{r} +TukeyHSD(aov.shannon.all) +``` + +However, you will see that we don't get any data from ADG since it is continuous. There is an error denoting this as "non-factors ignored: ADGKG" + +So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output. +```{r} +glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta) +summary(glm.shannon.all) +``` + +Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. 
Let's go through each line + +* (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn't expect it to be for any alpha-diversity metric since 0 means nothing is there + +* AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing "shannon ~ AgeGroup" and only looking at the 2w-1yr pairwise comparison) +* AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison + +* ADGKG - the slope of Shannon to ADGKG (the same as testing "shannon ~ ADGKG") + +* AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr +* AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr + +As we saw in ANOVA, none of the interaction terms are significant so we remove them. +```{r} +glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta) +summary(glm.shannon.all2) +``` + +**Note**: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables. + +We can run a similar test with non-normal data like Chao. +```{r} +qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson") +summary(qp.chao.all) +``` + +Remove the non-significant interaction. +```{r} +qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson") +summary(qp.chao.all2) +``` + +##Repeated measure +Another thing to consider with this data is the fact that we sampled the same animals over time. So, we have a repeated measures design. There are a number of ways to do repeated measures in R. I personally like the `lme4` package used here. + +We add the repeated measure component by adding a random effect for the individual animals with `(1|Animal)` in the `lmer` function. 
+```{r} +rm.shannon.all = lmer(shannon ~ AgeGroup+ADGKG + (1|Animal), data=meta) +summary(rm.shannon.all) +``` + +We see that very little of the variance in the data is explained by the animal random effects (0.03793). So we actually don't need to include repeated measures in our final model, but it was necessary to check! + + +**From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.** + + +#Beta-diversity +Beta-diversity is between sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (*i.e.* diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (*i.e.* richness: Jaccard, unweighted UniFrac). + +Beta-diversity appears like the following (completely made-up numbers) + + . | sample1 | sample2 | sample3 | ... +------- | ------- | ------- | ------- | --- +sample1 | 0 | 0.345 | 0.194 | ... +sample2 | 0.345 | 0 | 0.987 | ... +sample3 | 0.194 | 0.987 | 0 | ... + ... | ... | ... | ... | ... + +##Visualization +The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal coordinate analysis or PCA/PCoA if you've heard of that, only nMDS is more statistically robust with multiple iterations in the form of the `trymax` part of the command. + +Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar. + +###OTU-based metrics +There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account. 
The distance between *Prevotella* OTU1 and *Prevotella* OTU2 is equivalent to the distance between *Prevotella* OTU1 and *Bacteroides* OTU1. + +####Dot plots +First, we calculate the nMDS values for a 2-axis `k=2` graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (*i.e.* diversity). This uses the `metaMDS` function from the package `vegan`. +```{r} +BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000) +``` + +We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2-axis are sufficient to view the data. + +Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages +```{r} +par(mfrow = c(1, 1)) +#Create a blank plot for the nmds +plot(BC.nmds, type="n", main="Bray-Curtis") +#Add the points colored by age +points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) +#Add a legend +legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +``` + +This will create a plot in the lower right quadrant. If you want to get fancy, type "?plot" in the console to see other ways to modify the plot function. + +A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (*i.e.* richness). +```{r} +J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000) + +plot(J.nmds, type="n", main="Jaccard") +points(J.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-3, 1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +``` + +You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. 
This is because Jaccard is a transformation of Bray-Curtis with J = 2BC/(1+BC) + +####Ellipses +You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using `ordiellipse` from `vegan`. + +Code courtesy of Madison Cox. +```{r} +plot(BC.nmds, type="n", main="Bray-Curtis") +legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) + +#Add an ellipse for 2w +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) + +#Add an ellipse for 8w +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) + +#Add an ellipse for 1yr +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) +``` + +We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota. + +####3D plots +If your stress is high (like over 0.3) for your `metaMDS` calculation, you probably need to increase to 3 axes `k=3`. Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the `plotly` package to visualize a 3D Bray-Curtis plot. 
+ +```{r} +#Calculate the Bray-Curtis nMDS for 3-axis +BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000) +``` + +Extract x-y-z values for this nmds +```{r} +BCxyz = scores(BC.nmds.3D, display="sites") +#This is a table that looks like +BCxyz +``` + +Plot the xyz coordinates and color by age +```{r} +plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) +``` + +**Note**: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so. +```{r} +par(mfrow=c(1,2)) +#Axis 1 and 2 (x and y) +plot(BCxyz[,1], BCxyz[,2], main="Bray-Curtis 1:2", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-5.4, 3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Axis 1 and 3 (x and z) +plot(BCxyz[,1], BCxyz[,3], main="Bray-Curtis 1:3", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) +``` + +###Phylogenetic-based metrics +The most common of this type of beta-diversity metrics is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant. + +Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data. + +Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands. + +####Create physeq object +To start, you must make a `phyloseq` object which includes the OTU.clean, meta, and tax.clean data. 
We tell R which tables are each type +```{r nMDS_unifrac} +OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE) +tax.UF = tax_table(as.matrix(tax.clean)) +meta.UF = sample_data(meta) +``` + +We then merge these into an object of class phyloseq. +```{r} +physeq = phyloseq(OTU.UF, tax.UF, meta.UF) +``` + +To add the phylogenetic component to UniFrac, we calculate a rooted phylogenetic tree of our OTUs. This takes a long time so we have provided the tree for you. + +However, if we were to calculate a tree, first, we import a distance matrix created from representative sequences of our OTUs. We would use `phangorn` to read the file as it was created in mothur as seen under "Trees of OTUs" [here](https://rpubs.com/dillmcfarlan/mothurSOP). + +**DO NOT RUN THIS** +```{} +dist.mat = import_mothur_dist("clean_repFasta.phylip.dist") +``` + +We would then calculate a rooted neighbor-joining tree from the distance matrix using the `ape` package. + +**DO NOT RUN THIS** +```{} +NJ.tree = bionj(dist.mat) +``` + +Instead, we have pre-calculated this tree and you can load is with +```{r} +load("Data/NJ.tree.Rdata") +``` + +Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations. +```{r} +physeq.tree = merge_phyloseq(physeq, NJ.tree) +``` + +We can look at this object and see its components. +```{r} +physeq.tree +``` + +####Dot plots +Calculate weighted UniFrac (*i.e.* diversity) distances and ordinate into an nMDS. We specify weighted with `weighted=TRUE`. +```{r} +wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE) +``` + +You can plot UniFrac nMDS using the basic `plot` function as we've done before. +```{r} +par(mfrow=c(1,1)) +plot(wUF.ordu, type="n", main="Weighted UniFrac") +points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(0.3,0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +``` + +But let's also look at the `ggplot2` package. 
This package is incredibly powerful and can be customized in many ways. [This document](https://www.rstudio.com/wp-content/uploads/2016/11/ggplot2-cheatsheet-2.1.pdf) has many helpful tips. +```{r} +plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + + scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + + theme_bw() + + ggtitle("Weighted UniFrac") +``` + +Unweighted UniFrac (*i.e.* richness) can be visualized in the same way. We specify unweighted with `weighted=FALSE`. +```{r} +uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE) + +plot_ordination(physeq.tree, uwUF.ordu, type="sites", color="AgeGroup") + + scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + + theme_bw() + + ggtitle("Unweighted UniFrac") +``` + +####Ellipses +Ellipses can be plotted instead of points as well. With the basic plot function: +```{r} +plot(wUF.ordu, type="n", main="Weighted UniFrac") +legend(0.3, 0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) + +#Add an ellipse for 2w +ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) + +#Add an ellipse for 8w +ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) + +#Add an ellipse for 1yr +ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) +``` + +We can also plot ellipses in `ggplot2`. However, these ellipses are not the exact same at the standard error ellipses used with OTU-based metrics as they use different underlying calculations. However, they get at the same question of confidence intervals for groups of points on an nMDS. 
+ +We plot ellipses with `ggplot2` by adding the `stat_ellipse` function to our plot. +```{r} +plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + + scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + + theme_bw() + + stat_ellipse() + + ggtitle("Weighted UniFrac") +``` + +####3D plots +3D UniFrac ordinations are not currently supported by `phyloseq`. We see that our ordinations only include 2 dimensions. +```{r} +wUF.ordu +``` + +But we can instead calculate UniFrac distances using `UniFrac` and ordinating for 3-axes with `metaMDS`. + +```{r} +wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE) +wUF.nmds.3D = metaMDS(wUF.dist, method="NMDS", k=3) +``` + +Then, similar to what we did with Bray-Curtis/Jaccard, we pull out the xyz values and plot with `plotly`. +```{r} +wUFxyz = scores(wUF.nmds.3D, display="sites") +#This is a table that looks like +wUFxyz + +plot_ly(x=wUFxyz[,1], y=wUFxyz[,2], z=wUFxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) +``` + +###Vectors for continuous variables +While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots. + +To do this, we first fit the variables to our distances using the `envfit` function in `vegan`. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac. +```{r nMDS_vectors} +fit.BC = envfit(BC.nmds, meta) +fit.BC +``` +We see that it has automatically fit every variable in our meta table. + +The simplest way around this is to just ask envfit to run on only the variables you want. 
+```{r} +fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")]) +fit.BC +``` + +We repeat for weighted UniFrac +```{r} +fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")]) +fit.wUF +``` +For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group's name. For continuous variables, it adds an arrow in the direction from smallest to largest value. + +**Note**: The P-values for variables in `envfit` are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, `envfit` P-values tell you how well the arrow or centroids fit the *x-y data of the nMDS*, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was *not* significant, the fitted arrow/centroids will also *not* be significant. We see this type of result here, but this will not always be the case. + +However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it's difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in `envfit` that you know are significant in other tests may still be represented on an nMDS as a visual aid. + +Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won't use this plot in a publication, but it is good practice. 
+ +For Bray-Curtis: +```{r} +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Add fitted variables +plot(fit.BC, col="black") +``` + +You could also ask it to only plot variables with a fit P-value < 0.05. So we would only see the centroids +```{r} +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Add fitted variables +plot(fit.BC, col="black", p.max=0.05) +``` + +Weighted UniFrac +```{r} +plot(wUF.ordu, type="n", main="Weighted UniFrac") +points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(.3,.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Add fitted variables +plot(fit.wUF, col="black") +``` + +You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group. + +Fitting all OTUs would take awhile so we will only fit the first 10 in our table. +```{r} +fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10]) +fit.BC.OTU + +#We will only plot significant arrows in this case +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Add fitted variables +plot(fit.BC.OTU, col="black", p.max=0.05) +``` + +You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs. 
+```{r} +#Extract all OTUs within the genus Ruminococcus +OTU.Rumino = OTU.clean[,tax.clean$Genus == "g__Ruminococcus"] +#Sum the abundances of the Ruminococcus OTUs into one variable (column) +OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino) + +#Fit the new Ruminococcus group +fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum) +fit.BC.Rumino + +#Plot +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup]) +legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +#Add fitted variables +plot(fit.BC.Rumino, col="black", labels=c("Ruminococcus")) +``` + +##Statistically test beta-diversity +While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest. + +You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are significant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. Like group 1 has a lot of *Prevotella* OTU1 and group 2 has a lot of *Prevotella* OTU2, but they are both *Prevotella* so UniFrac treats them as being very similar. + +###PERMANOVA +For Bray-Curtis or Jaccard, we use the `vegan` package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model. + +**Note**: adonis cannot handle or account for NA or blanks in your data. 
Subset to only samples with complete metadata before running `vegdist` if these exist. +```{r} +#Calculate Bray-Curtis distances and save as a dist object +#NOTE: vegdist() chooses the metric with `method`, not `distance` (that is metaMDS's argument); an unknown `distance=` is silently ignored and Bray-Curtis is used +BC.dist=vegdist(OTU.clean, method="bray") +#Run PERMANOVA on distances. +adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000) +``` + +Similarly for Jaccard +```{r} +#`method="jaccard"` is required here; with `distance="jaccard"` this object would silently be Bray-Curtis again +J.dist=vegdist(OTU.clean, method="jaccard") +adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000) +``` +We see that the interaction is not significant so we remove it. +```{r} +adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000) +adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000) +``` + + +For UniFrac, we use the `phyloseq` package to calculate distances and then `vegan` to run PERMANOVA. +```{r} +wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE) +adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000) + +uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE) +adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000) +``` +Remove non-significant interaction term +```{r} +adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000) +adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000) +``` + +###ANOSIM +If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if your group sizes are *very, very* different, like 10 vs 100. 
+ +For example, Bray-Curtis: +```{r} +anosim(BC.dist, meta$AgeGroup, permutations = 1000) +``` + +**Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.** + +###2D variables +These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi. + +`mantel` from `vegan` tests if two distance matrices co-vary *i.e.*, does the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter. + +You can only compare samples where you have both types of data so we must subset our OTU table to only the samples that we also have SCFA for. The names are a little different between the tables so we also add ".F" to the SCFA names to make them match +```{r} +OTU.SCFA = OTU.clean[row.names(OTU.clean) %in% paste(row.names(SCFA), ".F", sep=""),] +``` + +We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it. +```{r} +dist1 = vegdist(OTU.SCFA) +dist2 = vegdist(SCFA) +``` + +Run a Mantel test comparing the 2 matrices. +```{r} +mantel(dist1, dist2, permutations=100) +``` + +We see that the overall OTU table and SCFA tables do not co-vary. 
+ +You can also run Mantel on 3 matrices at once like so + +**Do not run as we do not have 3 matrices here** +```{} +mantel.partial(dist1, dist2, dist3, permutations=100) +``` + +##Beta dispersion +Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances). + +Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM + +Calculate dispersion (variances) within each group. +```{r} +disp.age = betadisper(BC.dist, meta$AgeGroup) +``` + +Perform an ANOVA-like test to determine if the variances differ by groups. +```{r} +permutest(disp.age, pairwise=TRUE, permutations=1000) +``` + +Combining this with our plot, +```{r} +plot(BC.nmds, type="n", main="Bray-Curtis") +legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20) +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE) +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE) +ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE) +``` + +we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers. + + +#OTUs that differ by +##Categorical variables +Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don't differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. 
Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don't want to look at over 5000 P-values, do you? + +There are a number of ways to decrease the number of OTUs you're looking at + +1. Don't use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest +2. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample +3. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples +4. Combine 2 and 3 + +However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low abundant OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren't useful? + +So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. **Note**: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables. + +SIMPER's output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are **cumulative**, so to get each OTU's contribution, you must subtract the previous OTU's value. + +For example +```{r ID_OTUs_differ} +simper(OTU.clean, meta$AgeGroup, permutations=100) +``` + +We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups. 
*They are not necessarily significantly different.* + +To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age. +```{r} +kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup) +``` + +In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group +```{r} +kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup) +``` +**Note**: These P-values have not been corrected for false discovery rate (fdr) yet. + +Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his [GitHub page](https://github.com/asteinberger9/seq_scripts) and we have provided them for this workshop in `/Steinberger_scripts` + +**Disclaimer** *Andrew has provided these scripts out of the goodness of his heart and provides no guarantee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.* + +The use of these scripts is as follows (from Steinberger GitHub with some modifications) + +**simper_pretty.R** + +This script is meant to rapidly perform the SIMPER function from the R package `vegan` for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. 
Requires installation of the R package 'vegan'. + +Usage: + +simper.pretty(x, metrics, c('interesting'), perc_cutoff=0.5, low_cutoff = 'y', low_val=0.01, 'output_name') + +Inputs: + +* x: OTU table +* metrics: metadata table +* interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c('int1','int2','int3') +* perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output. +* low_cutoff: 'y' if want to REMOVE OTUs that contribute less than 1% +* low_val: set value of low cutoff (0.01), ignored if low_cutoff='n'. +* output_name: the name that is appended to the output filename "_clean_simper.csv". + + +**R_krusk.R** + +This script takes the output .csv of `simper_pretty.R`, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages 'vegan' and 'dplyr'. + +Usage: + +kruskal.pretty(x, metrics, csv, c('interesting'), 'output_name', taxonomy) + +Inputs: + +* x: OTU table +* metrics: metadata table +* csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv("PATH to name_clean_simper.csv")) +* interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c('int1','int2','int3') +* output_name= the name that is appended to the output filename "_krusk_simper.csv". +* taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional) + +First, we load these functions into R. 
+```{r} +source("Steinberger_scripts/simper_pretty.r") +source("Steinberger_scripts/R_krusk.r") +``` + +Then, we apply them to our data. We will ask for all SIMPER OTUs (`perc_cutoff = 1`, meaning up to cumulative 100%) but cut off any OTUs that individually contribute less than 1% to SIMPER (`low_val=0.01`). You may want to consider different cutoffs for your data. +```{r} +simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age') + +simper.results = data.frame(read.csv("Age_clean_simper.csv")) +kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax) +``` + +If we import the Kruskal-Wallis results back into R and select only OTUs that were significantly different after fdr correction (fdr_krusk_p.val)... +```{r} +#Import +KW.results = data.frame(read.csv("Age_krusk_simper.csv")) +#Remove non-significant +KW.results.signif = KW.results[KW.results$fdr_krusk_p.val < 0.05,] +#Order by OTU# +KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),] +head(KW.results.signif) +``` +we see a number of OTUs that significantly differ by age group. + +Looking at OTU1 as relative abundance +```{r} +#Calculate abundance +abund = OTU.clean/rowSums(OTU.clean)*100 +#plot +boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1") +``` + +and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves. + +##Continuous variables +For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models `glm` of the OTU abundances with different distributions `family=` similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model. 
+ +##Correlations +So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the "strong" category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available. + +Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables? + +Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set. +```{r} +#Remember we calculated abundance before with +#abund = OTU.clean/rowSums(OTU.clean)*100 + +#Subset OTUs to abundance cutoff +OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))] + +cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall") +cor.kendall +``` + +In this case, we don't see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm. + +Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data. +```{r} +#Calculate abundances +abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100 + +#Subset OTUs to abundance cutoff +OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))] + +cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall") +cor.kendall +``` + +If the data table is too large to view in R, you can write it to a table in your project folder. +```{r} +write.table(cor.kendall, file = "cor_kendall.csv", sep = ",") +``` + +We see that some OTUs strongly correlate with SCFAs. 
For example, Otu00021 and Otu00025 with Formate + +```{r} +par(mfrow = c(1, 2)) +plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21") +plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25") +``` + +Clearly we don't have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit. +```{r} +OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate) +summary(OTU21.Formate) + +OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate) +summary(OTU25.Formate) +``` + +So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occurring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing. + +#Other visualizations +##Bar charts +The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the `phyloseq` / `ggplot2` packages. + +Let's explore some of the bar chart options. 
First, we'll make the classic additive bar chart for phyla in our samples +```{r Bar_charts} +plot_bar(physeq.tree, fill="Phylum") +``` + +We can simplify by grouping our samples by age group +```{r} +plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") +``` + +And removing the lines between OTUs in the bars +```{r} +plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") +``` + +And only showing the top 5 most abundant phyla +```{r} +#Sort the Phyla by abundance and pick the top 5 +top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5] +#Cut down the physeq.tree data to only the top 5 Phyla +top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names)) +#Plot +plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") +``` + +There are many more options within `ggplot2` to alter this figure. [This document](https://www.rstudio.com/wp-content/uploads/2016/11/ggplot2-cheatsheet-2.1.pdf) has many helpful tips. + +Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid +```{r} +plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") +``` + +And you can break it down at any taxonomic level and color by any other level. + + +##Trees +We can also plot phylogenetic trees and label/modify them by our variables of interest. + +Let's look at the genus *Prevotella* in our data. We want to subset down to just this genus or else our plot would be too cluttered to read. + +Subset by genus +```{r Trees} +prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella") +``` + +We can see that this worked by comparing the number of taxa in our subset and our original data +```{r} +physeq.tree +prevotella +``` + +We can plot these OTUs on a tree. 
+```{r} +plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE) +``` + +In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots. + +Let's make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs + +```{r} +plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE) +``` + +Already it's a little difficult to read. You can view a larger page by clicking "Zoom" above the figure. Or export the figure as a PDF and save as a full page size, 9.5x11. + +There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options. + +##Heat maps +There are some good options in both `phyloseq` and `gplots` to make heatmaps. We will go through `phyloseq` but know that the same things could be done in `gplots` with code specific to that package. + +###OTUs +We're going to just look at the 20 most abundant OTUs to make it more readable. 
+```{r Heat_maps} +#Sort the OTUs by abundance and pick the top 20 +top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20]) +#Cut down the physeq.tree data to only the top 20 OTUs +top20OTU = prune_taxa(top20OTU.names, physeq.tree) +``` + +We now see that we only have 20 taxa +```{r} +top20OTU +``` + +First, you can make a heatmap of OTU abundance across all samples +```{r} +plot_heatmap(top20OTU) +``` + +And grouped by our age groups +```{r} +plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup") +``` + +We can label the OTU taxa +```{r} +plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus") +``` + +And group OTUs within the same Phyla +```{r} +plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum") +``` + +We can also change the colors (white -> purple), including the 0s/NAs (grey). +```{r} +plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="grey") +``` + +You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We'll show Bray-Curtis as an example. Other options are + +* bray +* jaccard +* wunifrac +* uwunifrac + +```{r} +plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis") +``` + +###Beta-diversity +The other common use for heatmaps is to show distances between samples (*i.e.* beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS. + +We do not want to use the plot_heatmap() function from `phyloseq` because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a `gplots` command. 
This command will automatically group samples by similarity (trees) +```{r} +#Bray-Curtis +heatmap.2(as.matrix(BC.dist)) + +#UniFrac +heatmap.2(as.matrix(wUF.dist)) +``` + +You could also change the colors +```{r} +#Rainbow colors +rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9) +heatmap.2(as.matrix(BC.dist), col=rc) +``` + +As always, for further customization, explore with ?heatmap.2 + +##Venn diagrams +Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F + +Create a list of OTUs that occur (count > 0) in each sample. + +* We select for the row by name with *OTU.clean["name",]* +* We select the columns with a value >0 with *OTU.clean[,apply()]* + +```{r Venn_diagrams} +OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))]) + +OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))]) + +OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))]) +``` + +We can then use these lists of OTUs to plot a Venn diagram with venn() from the `gplots` package +```{r} +venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr)) +``` + +We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr +```{r} +OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))]) + +OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))]) + +OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))]) +``` + +And plot +```{r} +venn(list(OTU.2w, OTU.8w, OTU.1yr)) +``` + +These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn. + +Once you have these, you can use the VennDiagram package for more pretty graphing options. 
For example, the age groups venns would be +```{r} +draw.triple.venn(area1 = 385+58+71+320, area2 = 801+190+320+71, area3 = 3177+190+58+71, n12 = 320+71, n23 = 190+71, n13 = 58+71, n123 = 71, category = c("2w", "8w", "1yr"), lty = "blank", fill = c("green", "red", "blue")) +``` + +Or we can export the OTU lists and make Venns with this online tool http://bioinformatics.psb.ugent.be/webtools/Venn/. This tool is handy in that it gives you the list of OTUs within the Venn sections so that you can see which specific bacteria are shared. +```{r} +write.table(OTU.2w, "OTU.2w.csv", sep=",", row.names=FALSE, col.names=FALSE) +write.table(OTU.8w, "OTU.8w.csv", sep=",", row.names=FALSE, col.names=FALSE) +write.table(OTU.1yr, "OTU.1yr.csv", sep=",", row.names=FALSE, col.names=FALSE) +``` + +##Networks +###OTUs +You can plot the distances between OTUs as a network. It would be an unreadable mess to plot all the OTUs in our data set, so we will just use the smaller prevotella data set. +```{r} +plot_net(prevotella, color="Species", type="taxa") +``` + +For co-occurrence networks of OTUs, I recommend [Gephi](https://gephi.org/) or [Cytoscape](http://www.cytoscape.org/). Thus far, I have not found an R package comparable to these other programs. + +###Beta-diversity +You can also plot beta-diversity as a network where the edges (lines) are the distances between samples. All metrics we've used here are supported (bray, jaccard, wunifrac, uwunifrac) + +```{r} +plot_net(physeq.tree, color="AgeGroup", distance="bray") +``` + +#Publication figures +Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the "Export" function within the Plots window, but this often does not result in high enough resolution. + +Ideally, you want to save in PostScript (.ps) or PDF (.pdf) formats because they are vector-based, meaning they are not any specific dpi and do not get blurry when zoomed in. 
Other formats (PNG, JPG, BMP, TIFF) are pixel-based formats (little square dots) and can become jagged when zoomed in. + +If you have issues getting a specific font to work, try installing and loading the package `extrafont`. + +##PostScript +Here, we will use `postscript` to export as a `.ps`. This function uses + +* width, height: in inches unless otherwise specified with `units=` +* horizontal: TRUE = landscape, FALSE = portrait +* colormodel: RGB, CMYK, and others +* family: Font to be used within figures + +Then we add `layout` if we have more than one plot within the overall figure. + +* matrix: + + A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4) + + Then the number of rows, columns the figures should be oriented in +* widths: A list of scalars of how large each figure should be in width. +* heights: A list of scalars of how large each figure should be in height. + +```{r} +postscript("Fig1.ps", width = 7, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT") + +layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) + +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) + +boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) + +dev.off() +``` + +To open the resulting `.ps` file: + +* Open it directly in Adobe Illustrator (vectors are preserved) +* On a Mac, double-clicking on it will convert it automatically into a PDF and will open automatically into Preview. +* On Windows, it depends on how "file associations" are set-up. Typically the file would need some transformation on a "standard" Windows computer before it can be used. If Adobe software is installed, it could run via Distiller to convert the .ps to a PDF. 
+ +##PDF +To export directly to a PDF, we will use `pdf` + +```{r} +pdf("Fig1.pdf", width = 7, height = 3, colormodel = "rgb", family = "ArialMT") + +layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) + +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) + +boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) + +dev.off() +``` + +##PNG +PNG is pixel-based so it may get blurry if not at high enough resolution. The exact resolution can be specified by giving the dpi in `res=` + +```{r} +png("Fig1.png", width = 7, height = 3, units='in', res=300) + +layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1)) + +plot(BC.nmds, type="n", main="Bray-Curtis") +points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup]) + +boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue")) + +dev.off() +``` + ![](Fig1.png) \ No newline at end of file diff --git a/Microbiota_analysis_R/Microbiota_Analysis_in_R.html b/Microbiota_analysis_R/Microbiota_Analysis_in_R.html new file mode 100644 index 0000000..b7a15fe --- /dev/null +++ b/Microbiota_analysis_R/Microbiota_Analysis_in_R.html @@ -0,0 +1,2618 @@ + + + + + + + + + + + + + + + +Microbiota Analysis in R + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + +
+
+
+
+
+ +
+ + + + + + + +

Updated December 19, 2017

+

Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP

+
+

Tips for this workshop

+
    +
  1. If you have any issues in R, type ??command into the console where “command” is the function you are having issues with and a help page will come up.
  2. +
  3. Lines starting with # are comments that are for the reader’s benefit. These lines are not code and do not need to be entered into the console.
  4. +
  5. GREY boxes contain code that you can copy and paste to run on your machine.
  6. +
+
#GREY box
+
    +
  1. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console.

    +
    WHITE box
  2. +
  3. Basic R code you may find useful: +
      +
    1. Matrices/data frames are designated by [ , ] where it is [rows, columns]
    2. +
    3. | is or
    4. +
    5. & is and
    6. +
  4. +
+
+
+

Introduction

+

Written for R v3.3.2 in RStudio v1.0.136

+
+

Goal

+

The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs).

+

This tutorial assumes some basic statistical knowledge. Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don’t become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data!

+
+
+

Data

+

The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in Dill-McFarland et al. Sci Rep 7: 40864. Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA).

+
+
+

Files

+

We will use the following files created using the Microbiota Processing in mothur: Standard Operating Procedure (SOP).

+
    +
  • example.final.nn.unique_list.0.03.norm.shared (OTU table)
  • +
  • example.final.nn.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs)
  • +
+

We will also be using tab-delimited metadata and SCFA files created in Excel. The metadata includes our metadata (like age and ADGKG) as well as alpha-diversity metrics from example.final.nn.unique_list.0.03.norm.groups.summary calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals.

+
    +
  • example.metadata.txt
  • +
  • example.SCFA.txt
  • +
+

Finally, we will be loading a number of custom scripts from Steinberger_scripts and a pre-calculated OTU tree NJ.tree.RData. The information for creating this tree is provided in this tutorial.

+

All data can be downloaded from GitHub

+
+
+
+

Get set up

+
+

Download and install

+
    +
  • Base R: http://cran.mtu.edu/
  • +
  • RStudio: https://www.rstudio.com/products/rstudio/download3/
  • +
  • Packages: Open RStudio on your computer. If you have not already downloaded these packages, go to the lower right quadrant of your screen and open the Package tab. Click “download” and search for the package you want to download. +
      +
    • ape
    • +
    • dplyr
    • +
    • ggplot2
    • +
    • gplots
    • +
    • lme4
    • +
    • phangorn
    • +
    • plotly
    • +
    • tidyr
    • +
    • vegan
    • +
    • VennDiagram
    • +
    • phyloseq (phyloseq is not on CRAN, so we have to call it manually. See below.)
    • +
  • +
+

Copy and paste the following into your console.

+
source("https://bioconductor.org/biocLite.R")
+
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
+
biocLite("phyloseq")
+
## BioC_mirror: https://bioconductor.org
+
## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.3 (2017-11-30).
+
## Installing package(s) 'phyloseq'
+
## 
+## The downloaded binary packages are in
+##  /var/folders/xj/f47n0rmn6gz5rm2jgmqm94fr0000gp/T//Rtmprps3BS/downloaded_packages
+

Note: If you are having trouble installing packages, turn off your computer’s firewall temporarily.

+
+
+

Organization

+

All of our analyses will be organized into a “Project”.

+

Make a new project by selecting File->New project. Select “New Directory” and “Empty Project”. Name the project “Microbiota_Analysis_BRC” and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop

+

Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder.

+

Now your screen should look like this

+
    +
  • Upper left: Where you type and save the code you want to run.
  • +
  • Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane.
  • +
  • Lower left: The console. Where commands and outputs run (similar to the one mothur window).
  • +
  • Lower right: Variable. Explore the different tabs.
  • +
+
+
+
+

Data manipulation

+
+

Load Packages

+

The “library” command tells R to open the package you want to use. You need to do this every time you open R.

+
#Analyses of Phylogenetics and Evolution package. Required for tree calculations to be used with phyloseq
+library(ape)
+
+#This package will also help us more easily manipulate our data
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package.
+library(ggplot2)
+
+#This package is used to calculate and plot Venn diagrams as well as heatmaps
+library(gplots)
+
## 
+## Attaching package: 'gplots'
+
## The following object is masked from 'package:stats':
+## 
+##     lowess
+
#Linear mixed-effects models like repeated measures analysis
+library(lme4)
+
## Loading required package: Matrix
+
#used to read in mothur-formatted files
+library(phangorn)
+
+#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses.
+library(phyloseq)
+
+#A package to create interactive web graphics of use in 3D plots
+library(plotly)
+
## 
+## Attaching package: 'plotly'
+
## The following object is masked from 'package:ggplot2':
+## 
+##     last_plot
+
## The following object is masked from 'package:stats':
+## 
+##     filter
+
## The following object is masked from 'package:graphics':
+## 
+##     layout
+
#This package will help us more easily manipulate our data, which are matrices
+library(tidyr)
+
## 
+## Attaching package: 'tidyr'
+
## The following object is masked from 'package:Matrix':
+## 
+##     expand
+
#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses.
+library(vegan)
+
## Loading required package: permute
+
## Loading required package: lattice
+
## This is vegan 2.4-5
+
## 
+## Attaching package: 'vegan'
+
## The following objects are masked from 'package:phangorn':
+## 
+##     diversity, treedist
+
#Pretty Venn diagrams
+library(VennDiagram)
+
## Loading required package: grid
+
## Loading required package: futile.logger
+
## 
+## Attaching package: 'VennDiagram'
+
## The following object is masked from 'package:ape':
+## 
+##     rotate
+
+
+

Load Data

+

In the code, the text before = is what the file will be called in R. Make this short but unique as this is how you will tell R to use this file in later commands.

+
    +
  • header: tells R that the first row is column names, not data
  • +
  • row.names: tells R that the first column is row names, not data
  • +
  • sep: tells R that the data are tab-delimited. If you had a comma-delimited file, you would use sep=","
  • +
+
#OTU table (shared file)
+OTU = read.table("Data/example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t")
+
+#Taxonomy of each OTU
+tax = read.table("Data/example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t")
+
+#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names)
+meta = read.table("Data/example.metadata.txt", header=TRUE, row.names=1, sep="\t")
+
+#SCFA data
+SCFA = read.table("Data/example.SCFA.txt", header=TRUE, row.names=1, sep="\t")
+
+
+

Clean up the data

+

You can look at your data by clicking on it in the upper-right quadrant “Environment”

+

There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them.

+
+

OTU table

+

We need to use the “Group” column as the row names so that it will match our metadata

+
row.names(OTU) = OTU$Group
+

We then need to remove the “label”, “numOtus”, and “Group” columns as they are not OTU counts like the rest of the table

+
OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))]
+
+
+

Taxonomy table

+

For the taxonomy table, we name the rows by the OTU #

+
row.names(tax) = tax$OTU
+

Remove all the OTUs that don’t occur in our OTU.clean data set

+
tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),]
+

We then need to separate the “taxonomy” column so that each level (e.g. Domain, Phylum, etc.) is in its own column. We do this with a special command “separate” from the tidyr package

+
tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";")
+

Finally, we remove the “Size” and “Strain” columns as well as “OTU” since these are now the row names

+
tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))]
+
+
+

Metadata and SCFA tables

+

These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis.

+
+
+
+

Order the data

+

To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these.

+
OTU.clean = OTU.clean[order(row.names(OTU.clean)),]
+meta = meta[order(row.names(meta)),]
+SCFA = SCFA[order(row.names(SCFA)),]
+

Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it.

+
+
+

Set seed

+

We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed.

+
set.seed(8765)
+
+
+
+

Alpha-diversity

+

Alpha-diversity is within sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric.

+

This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2.

+
+

Explore alpha metrics

+

Now we will start to look at our data. We will first start with alpha-diversity and richness. Let’s plot some common ones here.

+
#Create 2x2 plot environment so that we can see all 4 metrics at once. 
+par(mfrow = c(2, 2))
+
+#Then plot each metric.
+hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
+hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10)
+hist(meta$chao, main="Chao richness", xlab="", breaks=15)
+hist(meta$ace, main="ACE richness", xlab="", breaks=15)
+

+

You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis.

+

Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I’ve specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon’s diversity as well as most richness metrics. Simpson’s diversity, on the other hand, is usually skewed as seen here.

+

So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity.

+

Let’s look at inverse Simpson instead.

+
#Create 2x2 plot environment 
+par(mfrow = c(2, 2))
+
+#Plots
+hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
+hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10)
+hist(meta$chao, main="Chao richness", xlab="", breaks=15)
+hist(meta$ace, main="ACE richness", xlab="", breaks=15)
+

+

Now we see a bimodal distribution for Simpson similar to the richness metrics.

+

To test for normalcy statistically, we can run the Shapiro-Wilk test of normality.

+
shapiro.test(meta$shannon)
+
## 
+##  Shapiro-Wilk normality test
+## 
+## data:  meta$shannon
+## W = 0.91511, p-value = 0.0456
+
shapiro.test(1/meta$simpson)
+
## 
+##  Shapiro-Wilk normality test
+## 
+## data:  1/meta$simpson
+## W = 0.74821, p-value = 4.69e-05
+
shapiro.test(meta$chao)
+
## 
+##  Shapiro-Wilk normality test
+## 
+## data:  meta$chao
+## W = 0.80636, p-value = 0.0003749
+
shapiro.test(meta$ace)
+
## 
+##  Shapiro-Wilk normality test
+## 
+## data:  meta$ace
+## W = 0.83017, p-value = 0.0009573
+

We see that, as expected from the graphs, none are normal.

+

However, our sample size is small and normalcy tests are very sensitive for small data-sets. In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!)

+

So, what does this mean for our purposes? Well, we should run statistical tests that don’t assume our data is normal, because we don’t have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well.

+

Overall, for alpha-diversity:

+
    +
  • ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal
  • +
  • Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal
  • +
+

Our main variables of interest are

+
    +
  • AgeGroup: 2w, 8w, 1yr
  • +
  • ADGKG: 0.05-1.56 kg gained per day (average daily gain kg)
  • +
+
+
+

Categorical variables

+

Now that we know which tests can be used, let’s run them.

+

Normally distributed metrics

+

Since it’s the closest to normalcy, we will use Shannon’s diversity as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test

+

Does age impact the Shannon diversity of the fecal microbiota?

+
#Run the ANOVA and save it as an object
+aov.shannon.age = aov(shannon ~ AgeGroup, data=meta)
+#Call for the summary of that ANOVA, which will include P-values
+summary(aov.shannon.age)
+
##             Df Sum Sq Mean Sq F value   Pr(>F)    
+## AgeGroup     2  42.98  21.489   103.4 1.35e-11 ***
+## Residuals   21   4.36   0.208                     
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey’s honest significance test of our ANOVA.

+
TukeyHSD(aov.shannon.age)
+
##   Tukey multiple comparisons of means
+##     95% family-wise confidence level
+## 
+## Fit: aov(formula = shannon ~ AgeGroup, data = meta)
+## 
+## $AgeGroup
+##             diff        lwr       upr   p adj
+## 2w-1yr -3.270063 -3.8446230 -2.695503 0.0e+00
+## 8w-1yr -1.830903 -2.4054628 -1.256342 2.0e-07
+## 8w-2w   1.439160  0.8646001  2.013720 8.5e-06
+

We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age.

+
#Re-order the groups because the default is 1yr-2w-8w
+meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr"))
+#Return the plot area to 1x1
+par(mfrow = c(1, 1))
+#Plot
+boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity")
+

+

Non-normally distributed metrics

+

We will use Chao’s richness estimate here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test)

+
kruskal.test(chao ~ AgeGroup, data=meta)
+
## 
+##  Kruskal-Wallis rank sum test
+## 
+## data:  chao by AgeGroup
+## Kruskal-Wallis chi-squared = 19.28, df = 2, p-value = 6.507e-05
+

We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests

+
pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr")
+
## 
+##  Pairwise comparisons using Wilcoxon rank sum test 
+## 
+## data:  meta$chao and meta$AgeGroup 
+## 
+##    1yr     2w     
+## 2w 0.00023 -      
+## 8w 0.00023 0.00186
+## 
+## P value adjustment method: fdr
+

Like diversity, we see that richness also increases with age.

+
#Create 1x1 plot environment
+par(mfrow = c(1, 1))
+#Plot
+boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness")
+

+
+
+

Continuous variables

+

For continuous variables, we use general linear models, specifying the distribution that best fits our data.

+

Normally distributed metrics

+

Since ADG is a continuous variable, we run a general linear model. We will again use Shannon’s diversity as our roughly normal metric. The default of glm and lm is the normal distribution so we don’t have to specify anything.

+

Does ADG impact the Shannon diversity of the fecal microbiota?

+
glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta)
+summary(glm.shannon.ADG)
+
## 
+## Call:
+## glm(formula = shannon ~ ADGKG, data = meta)
+## 
+## Deviance Residuals: 
+##      Min        1Q    Median        3Q       Max  
+## -2.49110  -1.11216  -0.01749   1.53658   1.84728  
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)   
+## (Intercept)  3.62565    1.01390   3.576  0.00169 **
+## ADGKG       -0.03407    0.97805  -0.035  0.97253   
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for gaussian family taken to be 2.151815)
+## 
+##     Null deviance: 47.343  on 23  degrees of freedom
+## Residual deviance: 47.340  on 22  degrees of freedom
+## AIC: 90.412
+## 
+## Number of Fisher Scoring iterations: 2
+

The output lets us know that the intercept of our model is significantly different from 0 but our slope (i.e. our variable of interest) is not. This makes sense when we look at the data.

+
plot(shannon ~ ADGKG, data=meta)
+#Add the glm best fit line
+abline(glm.shannon.ADG)
+

+

Non-normally distributed metrics

+

We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better.

+

But which distribution should we choose? In statistics, there is no one “best” model. There are only good and better models. We will use the plot() function to compare two models and pick the better one.

+

First, the Gaussian (normal) distribution, which we already know is a bad fit.

+
gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian")
+par(mfrow = c(1,2))
+plot(gaussian.chao.ADG, which=c(1,2))
+

+

Quasipoisson (log) distribution

+
qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson")
+par(mfrow = c(1,2))
+plot(qp.chao.ADG, which=c(1,2))
+

+

What we’re looking for is no pattern in the Residuals vs. Fitted graph (“stars in the sky”), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot.

+

While it’s still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness.

+
summary(qp.chao.ADG)
+
## 
+## Call:
+## glm(formula = chao ~ ADGKG, family = "quasipoisson", data = meta)
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -24.36  -17.05  -10.66   18.81   26.91  
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)   6.4528     0.5561  11.605 7.54e-11 ***
+## ADGKG        -0.1859     0.5438  -0.342    0.736    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for quasipoisson family taken to be 374.2485)
+## 
+##     Null deviance: 8117.2  on 23  degrees of freedom
+## Residual deviance: 8074.4  on 22  degrees of freedom
+## AIC: NA
+## 
+## Number of Fisher Scoring iterations: 5
+

Plotting this we see that, indeed, there is no significant correlation between Chao and ADG.

+
#Return the plot area to 1x1
+par(mfrow = c(1, 1))
+#Plot
+plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)")
+abline(qp.chao.ADG)
+

+
+
+

Mixed models

+

Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG. In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr).

+

Think about your variables and what they mean “in the real world.” Logically combine them into as few ANOVA tests as possible. In the end, it’s better to test a meaningless interaction than not test a meaningful one.

+

We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The * symbol is a shortcut for models. A*B is equivalent to A + B + A:B

+
aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta)
+summary(aov.shannon.all)
+
##                Df Sum Sq Mean Sq F value   Pr(>F)    
+## AgeGroup        2  42.98  21.489  95.472 2.61e-10 ***
+## ADGKG           1   0.05   0.054   0.239    0.631    
+## AgeGroup:ADGKG  2   0.26   0.130   0.576    0.572    
+## Residuals      18   4.05   0.225                     
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

We can see that the interaction of age and ADG doesn’t significantly impact Shannon diversity, so we should remove that variable to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms.

+
aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta)
+summary(aov.shannon.all2)
+
##             Df Sum Sq Mean Sq F value   Pr(>F)    
+## AgeGroup     2  42.98  21.489   99.70 3.96e-11 ***
+## ADGKG        1   0.05   0.054    0.25    0.623    
+## Residuals   20   4.31   0.216                     
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only.

+
TukeyHSD(aov.shannon.all)
+
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
+## ADGKG
+
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
+## AgeGroup, ADGKG
+
## Warning in TukeyHSD.aov(aov.shannon.all): 'which' specified some non-
+## factors which will be dropped
+
##   Tukey multiple comparisons of means
+##     95% family-wise confidence level
+## 
+## Fit: aov(formula = shannon ~ AgeGroup * ADGKG, data = meta)
+## 
+## $AgeGroup
+##             diff       lwr       upr    p adj
+## 2w-1yr -3.270063 -3.875469 -2.664657 0.00e+00
+## 8w-1yr -1.830903 -2.436309 -1.225496 1.20e-06
+## 8w-2w   1.439160  0.833754  2.044567 2.81e-05
+

However, you will see that we don’t get any data from ADG since it is continuous. There is a warning denoting this as “non-factors ignored: ADGKG”

+

So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output.

+
glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta)
+summary(glm.shannon.all)
+
## 
+## Call:
+## glm(formula = shannon ~ AgeGroup * ADGKG, data = meta)
+## 
+## Deviance Residuals: 
+##     Min       1Q   Median       3Q      Max  
+## -1.0301  -0.2468   0.0894   0.1572   0.7624  
+## 
+## Coefficients:
+##                  Estimate Std. Error t value Pr(>|t|)  
+## (Intercept)        5.7123     2.5928   2.203   0.0409 *
+## AgeGroup2w        -3.3969     2.6197  -1.297   0.2111  
+## AgeGroup8w        -2.9610     2.7554  -1.075   0.2967  
+## ADGKG             -0.4481     2.7599  -0.162   0.8728  
+## AgeGroup2w:ADGKG   0.1228     2.7848   0.044   0.9653  
+## AgeGroup8w:ADGKG   1.0750     2.8763   0.374   0.7130  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for gaussian family taken to be 0.22508)
+## 
+##     Null deviance: 47.3425  on 23  degrees of freedom
+## Residual deviance:  4.0514  on 18  degrees of freedom
+## AIC: 39.413
+## 
+## Number of Fisher Scoring iterations: 2
+

Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. Let’s go through each line

+
    +
  • (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn’t expect it to be for any alpha-diversity metric since 0 means nothing is there

  • +
  • AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing “shannon ~ AgeGroup” and only looking at the 2w-1yr pairwise comparison)
  • +
  • AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison

  • +
  • ADGKG - the slope of Shannon to ADGKG (the same as testing “shannon ~ ADGKG”)

  • +
  • AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr
  • +
  • AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr

  • +
+

As we saw in ANOVA, none of the interaction terms are significant so we remove them.

+
glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta)
+summary(glm.shannon.all2)
+
## 
+## Call:
+## glm(formula = shannon ~ AgeGroup + ADGKG, data = meta)
+## 
+## Deviance Residuals: 
+##      Min        1Q    Median        3Q       Max  
+## -0.95299  -0.25858   0.07643   0.30409   0.74487  
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)   5.4459     0.3487  15.619 1.14e-12 ***
+## AgeGroup2w   -3.2760     0.2324 -14.094 7.55e-12 ***
+## AgeGroup8w   -1.7989     0.2408  -7.471 3.30e-07 ***
+## ADGKG        -0.1639     0.3281  -0.500    0.623    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for gaussian family taken to be 0.2155447)
+## 
+##     Null deviance: 47.3425  on 23  degrees of freedom
+## Residual deviance:  4.3109  on 20  degrees of freedom
+## AIC: 36.903
+## 
+## Number of Fisher Scoring iterations: 2
+

Note: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables.

+

We can run a similar test with non-normal data like Chao.

+
qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson")
+summary(qp.chao.all)
+
## 
+## Call:
+## glm(formula = chao ~ AgeGroup * ADGKG, family = "quasipoisson", 
+##     data = meta)
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -7.774  -3.430  -0.140   3.692   5.277  
+## 
+## Coefficients:
+##                  Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)       6.99825    0.71122   9.840 1.14e-08 ***
+## AgeGroup2w       -1.61539    0.75272  -2.146   0.0458 *  
+## AgeGroup8w       -2.24498    0.86846  -2.585   0.0187 *  
+## ADGKG             0.01751    0.75699   0.023   0.9818    
+## AgeGroup2w:ADGKG -0.42295    0.80094  -0.528   0.6039    
+## AgeGroup8w:ADGKG  0.86269    0.86550   0.997   0.3321    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for quasipoisson family taken to be 18.86331)
+## 
+##     Null deviance: 8117.2  on 23  degrees of freedom
+## Residual deviance:  348.5  on 18  degrees of freedom
+## AIC: NA
+## 
+## Number of Fisher Scoring iterations: 4
+

Remove the non-significant interaction.

+
qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson")
+summary(qp.chao.all2)
+
## 
+## Call:
+## glm(formula = chao ~ AgeGroup + ADGKG, family = "quasipoisson", 
+##     data = meta)
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -7.783  -3.452  -1.378   3.744   8.184  
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)  7.03944    0.23567  29.870  < 2e-16 ***
+## AgeGroup2w  -1.98090    0.14862 -13.329 2.08e-11 ***
+## AgeGroup8w  -1.24286    0.11926 -10.422 1.57e-09 ***
+## ADGKG       -0.02643    0.24530  -0.108    0.915    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for quasipoisson family taken to be 23.74583)
+## 
+##     Null deviance: 8117.20  on 23  degrees of freedom
+## Residual deviance:  476.31  on 20  degrees of freedom
+## AIC: NA
+## 
+## Number of Fisher Scoring iterations: 4
+
+
+

Repeated measure

+

Another thing to consider with this data is the fact that we sampled the same animals over time. So, we have a repeated measures design. There are a number of ways to do repeated measures in R. I personally like the lme4 package used here.

+

We add the repeated measure component by adding a random effect for the individual animals with (1|Animal) in the lmer function.

+
rm.shannon.all = lmer(shannon ~ AgeGroup+ADGKG + (1|Animal), data=meta)
+summary(rm.shannon.all)
+
## Linear mixed model fit by REML ['lmerMod']
+## Formula: shannon ~ AgeGroup + ADGKG + (1 | Animal)
+##    Data: meta
+## 
+## REML criterion at convergence: 32.4
+## 
+## Scaled residuals: 
+##      Min       1Q   Median       3Q      Max 
+## -1.83117 -0.45932  0.09539  0.49972  1.53368 
+## 
+## Random effects:
+##  Groups   Name        Variance Std.Dev.
+##  Animal   (Intercept) 0.03793  0.1948  
+##  Residual             0.17819  0.4221  
+## Number of obs: 24, groups:  Animal, 8
+## 
+## Fixed effects:
+##             Estimate Std. Error t value
+## (Intercept)   5.3906     0.3520  15.313
+## AgeGroup2w   -3.2739     0.2114 -15.486
+## AgeGroup8w   -1.8104     0.2208  -8.201
+## ADGKG        -0.1049     0.3321  -0.316
+## 
+## Correlation of Fixed Effects:
+##            (Intr) AgGrp2 AgGrp8
+## AgeGroup2w -0.350              
+## AgeGroup8w -0.027  0.461       
+## ADGKG      -0.884  0.057 -0.293
+

We see that very little of the variance in the data is explained by the animal random effects (0.03793). So we actually don’t need to include repeated measures in our final model, but it was necessary to check!

+

From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.

+
+
+
+

Beta-diversity

+

Beta-diversity is between sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (i.e. diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (i.e. richness: Jaccard, unweighted UniFrac).

+

Beta-diversity appears like the following (completely made-up numbers)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
.         sample1   sample2   sample3
sample1   0         0.345     0.194
sample2   0.345     0         0.987
sample3   0.194     0.987     0
+
+

Visualization

+

The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal component or principal coordinate analysis (PCA/PCoA) if you’ve heard of those, only nMDS is more statistically robust with multiple iterations in the form of the trymax part of the command.

+

Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar.

+
+

OTU-based metrics

+

There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account. The distance between Prevotella OTU1 and Prevotella OTU2 is equivalent to the distance between Prevotella OTU1 and Bacteroides OTU1.

+
+

Dot plots

+

First, we calculate the nMDS values for a 2-axis k=2 graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (i.e. diversity). This uses the metaMDS function from the package vegan.

+
BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000)
+
## Square root transformation
+## Wisconsin double standardization
+## Run 0 stress 0.06208161 
+## Run 1 stress 0.06210668 
+## ... Procrustes: rmse 0.001636313  max resid 0.005662513 
+## ... Similar to previous best
+## Run 2 stress 0.06208261 
+## ... Procrustes: rmse 0.0008174643  max resid 0.00186259 
+## ... Similar to previous best
+## Run 3 stress 0.06208133 
+## ... New best solution
+## ... Procrustes: rmse 0.000495613  max resid 0.001143981 
+## ... Similar to previous best
+## Run 4 stress 0.06208228 
+## ... Procrustes: rmse 0.0002768028  max resid 0.0006083455 
+## ... Similar to previous best
+## Run 5 stress 0.06208254 
+## ... Procrustes: rmse 0.0003377152  max resid 0.0007457908 
+## ... Similar to previous best
+## Run 6 stress 0.06208233 
+## ... Procrustes: rmse 0.000285801  max resid 0.000626649 
+## ... Similar to previous best
+## Run 7 stress 0.06210685 
+## ... Procrustes: rmse 0.001453303  max resid 0.005539077 
+## ... Similar to previous best
+## Run 8 stress 0.062104 
+## ... Procrustes: rmse 0.001430176  max resid 0.005147467 
+## ... Similar to previous best
+## Run 9 stress 0.06208351 
+## ... Procrustes: rmse 0.0005018534  max resid 0.00111944 
+## ... Similar to previous best
+## Run 10 stress 0.06208269 
+## ... Procrustes: rmse 0.0003614257  max resid 0.0008024269 
+## ... Similar to previous best
+## Run 11 stress 0.06208154 
+## ... Procrustes: rmse 0.0004861021  max resid 0.001120926 
+## ... Similar to previous best
+## Run 12 stress 0.06212707 
+## ... Procrustes: rmse 0.001859292  max resid 0.005339963 
+## ... Similar to previous best
+## Run 13 stress 0.3702005 
+## Run 14 stress 0.06210406 
+## ... Procrustes: rmse 0.001425256  max resid 0.00512563 
+## ... Similar to previous best
+## Run 15 stress 0.06208142 
+## ... Procrustes: rmse 3.189023e-05  max resid 6.612762e-05 
+## ... Similar to previous best
+## Run 16 stress 0.06210429 
+## ... Procrustes: rmse 0.001578454  max resid 0.005195898 
+## ... Similar to previous best
+## Run 17 stress 0.06210796 
+## ... Procrustes: rmse 0.00155285  max resid 0.00562623 
+## ... Similar to previous best
+## Run 18 stress 0.06208191 
+## ... Procrustes: rmse 0.0001981339  max resid 0.0004391198 
+## ... Similar to previous best
+## Run 19 stress 0.06208168 
+## ... Procrustes: rmse 0.0001331311  max resid 0.000291077 
+## ... Similar to previous best
+## Run 20 stress 0.06210592 
+## ... Procrustes: rmse 0.001396183  max resid 0.005412384 
+## ... Similar to previous best
+## *** Solution reached
+

We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2 axes are sufficient to view the data.

+

Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages

+
par(mfrow = c(1, 1))
+#Create a blank plot for the nmds
+plot(BC.nmds, type="n", main="Bray-Curtis")
+#Add the points colored by age
+points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
+#Add a legend
+legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+

+

This will create a plot in the lower right quadrant. If you want to get fancy, type “?plot” in the console to see other ways to modify the plot function.

+

A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (i.e. richness).

+
J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000)
+
## Square root transformation
+## Wisconsin double standardization
+## Run 0 stress 0.0620818 
+## Run 1 stress 0.06208178 
+## ... New best solution
+## ... Procrustes: rmse 0.0007016851  max resid 0.001623036 
+## ... Similar to previous best
+## Run 2 stress 0.06210633 
+## ... Procrustes: rmse 0.001409348  max resid 0.005467011 
+## ... Similar to previous best
+## Run 3 stress 0.06210745 
+## ... Procrustes: rmse 0.001470069  max resid 0.00557513 
+## ... Similar to previous best
+## Run 4 stress 0.06208144 
+## ... New best solution
+## ... Procrustes: rmse 0.0001309513  max resid 0.0002717662 
+## ... Similar to previous best
+## Run 5 stress 0.06208156 
+## ... Procrustes: rmse 5.349512e-05  max resid 0.0001195792 
+## ... Similar to previous best
+## Run 6 stress 0.06208137 
+## ... New best solution
+## ... Procrustes: rmse 2.027381e-05  max resid 4.710602e-05 
+## ... Similar to previous best
+## Run 7 stress 0.06208345 
+## ... Procrustes: rmse 0.0004560942  max resid 0.001010311 
+## ... Similar to previous best
+## Run 8 stress 0.06210681 
+## ... Procrustes: rmse 0.001448074  max resid 0.005531499 
+## ... Similar to previous best
+## Run 9 stress 0.06208334 
+## ... Procrustes: rmse 0.000447034  max resid 0.0009841724 
+## ... Similar to previous best
+## Run 10 stress 0.06208155 
+## ... Procrustes: rmse 7.705878e-05  max resid 0.0001651192 
+## ... Similar to previous best
+## Run 11 stress 0.06208217 
+## ... Procrustes: rmse 0.0002412108  max resid 0.0005340427 
+## ... Similar to previous best
+## Run 12 stress 0.06210429 
+## ... Procrustes: rmse 0.001420012  max resid 0.005133791 
+## ... Similar to previous best
+## Run 13 stress 0.06208263 
+## ... Procrustes: rmse 0.0002884997  max resid 0.0006395557 
+## ... Similar to previous best
+## Run 14 stress 0.06208166 
+## ... Procrustes: rmse 0.0001135875  max resid 0.0002424163 
+## ... Similar to previous best
+## Run 15 stress 0.06210651 
+## ... Procrustes: rmse 0.001438738  max resid 0.005503184 
+## ... Similar to previous best
+## Run 16 stress 0.06208137 
+## ... New best solution
+## ... Procrustes: rmse 6.557907e-05  max resid 0.0001605636 
+## ... Similar to previous best
+## Run 17 stress 0.06208244 
+## ... Procrustes: rmse 0.0002971128  max resid 0.0007158105 
+## ... Similar to previous best
+## Run 18 stress 0.06208222 
+## ... Procrustes: rmse 0.0002613032  max resid 0.000635712 
+## ... Similar to previous best
+## Run 19 stress 0.06208197 
+## ... Procrustes: rmse 0.0002080938  max resid 0.0005677372 
+## ... Similar to previous best
+## Run 20 stress 0.0620832 
+## ... Procrustes: rmse 0.0004183351  max resid 0.0009705139 
+## ... Similar to previous best
+## *** Solution reached
+
plot(J.nmds, main="Jaccard", type="n")
+#One dot per sample, colored through AgeGroup exactly as in the Bray-Curtis plot
+points(J.nmds, col=c("blue", "green", "red")[meta$AgeGroup], pch=20, display="sites")
+legend(x=-3, y=1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+

+

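You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. This is because Jaccard, as calculated here, is a transformation of Bray-Curtis with J = 2BC/(1+BC). This transformation is monotonic, so the rank order of the sample-to-sample distances does not change, and nMDS only uses that rank order.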

+
+
+

Ellipses

+

You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using ordiellipse from vegan.

+

Code courtesy of Madison Cox.

+
plot(BC.nmds, main="Bray-Curtis", type="n")
+legend(x=-5.5, y=2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+
+#Shade one 99% standard-error ellipse per age group, each in its legend color
+for (age in c("2w", "8w", "1yr")) {
+  ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col=c("2w"="green", "8w"="red", "1yr"="blue")[[age]], draw="polygon", alpha=200, show.groups=age, border=FALSE)
+}
+

+

We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota.

+
+
+

3D plots

+

If your stress is high (like over 0.3) for your metaMDS calculation, you probably need to increase to 3 axes (k=3). Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the plotly package to visualize a 3D Bray-Curtis plot.

+
#Re-run the Bray-Curtis nMDS asking for 3 axes (k=3) instead of 2; all other settings match the 2D run
+BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000)
+
## Square root transformation
+## Wisconsin double standardization
+## Run 0 stress 0.04686346 
+## Run 1 stress 0.04741659 
+## Run 2 stress 0.04673425 
+## ... New best solution
+## ... Procrustes: rmse 0.01073904  max resid 0.0344814 
+## Run 3 stress 0.05061835 
+## Run 4 stress 0.04740131 
+## Run 5 stress 0.04984642 
+## Run 6 stress 0.04747801 
+## Run 7 stress 0.0523384 
+## Run 8 stress 0.05295437 
+## Run 9 stress 0.04741387 
+## Run 10 stress 0.0457586 
+## ... New best solution
+## ... Procrustes: rmse 0.03868237  max resid 0.1296728 
+## Run 11 stress 0.05094992 
+## Run 12 stress 0.04719303 
+## Run 13 stress 0.05012352 
+## Run 14 stress 0.04750204 
+## Run 15 stress 0.0479423 
+## Run 16 stress 0.04579561 
+## ... Procrustes: rmse 0.004692476  max resid 0.01495666 
+## Run 17 stress 0.05069634 
+## Run 18 stress 0.0485804 
+## Run 19 stress 0.05058189 
+## Run 20 stress 0.04859459 
+## Run 21 stress 0.04996713 
+## Run 22 stress 0.04740079 
+## Run 23 stress 0.04747632 
+## Run 24 stress 0.04675455 
+## Run 25 stress 0.04747574 
+## Run 26 stress 0.0486171 
+## Run 27 stress 0.04575823 
+## ... New best solution
+## ... Procrustes: rmse 0.0005374711  max resid 0.0008831403 
+## ... Similar to previous best
+## *** Solution reached
+

Extract x-y-z values for this nmds

+
BCxyz = scores(BC.nmds.3D, display="sites") #Per-sample coordinates on the fitted axes
+#Print the table: one row per sample, one column per axis (NMDS1, NMDS2, NMDS3)
+BCxyz
+
##                 NMDS1       NMDS2        NMDS3
+## 5017.1yr.F -4.7973931  0.33029806 -0.211481225
+## 5017.2w.F   3.1867260  0.06208276  1.484970505
+## 5017.8w.F   1.0614871 -2.13025264 -1.218243774
+## 5020.1yr.F -4.7579235  0.24440345 -0.002888360
+## 5020.2w.F   3.4979230 -1.00981047  1.015200903
+## 5020.8w.F   1.5897780 -1.93435391  0.464128291
+## 5026.1yr.F -4.7720517  0.20611823  0.214815994
+## 5026.2w.F   3.3976411  1.10010056 -0.616957559
+## 5026.8w.F   3.1483050  2.07715934  1.478767471
+## 5031.1yr.F -4.8021402  0.44250394  0.202447638
+## 5031.2w.F   3.3537430  0.48376070 -1.490408346
+## 5031.8w.F   0.8577869 -1.64300786  0.250766536
+## 5037.1yr.F -4.8522745  0.48898068 -0.004218580
+## 5037.2w.F   3.6593056  0.26886383 -0.507062657
+## 5037.8w.F   3.1326413 -0.82210579 -0.024946820
+## 5041.1yr.F -4.7724198  0.28335210  0.060469429
+## 5041.2w.F   3.1661815  2.43615798 -1.218459457
+## 5041.8w.F   1.0947996 -2.58325770 -0.236659085
+## 5045.1yr.F -4.7522029  0.16444286  0.004405471
+## 5045.2w.F   1.5110480  3.11956405 -0.469494555
+## 5045.8w.F   1.4900615 -2.17087166 -0.450930039
+## 5053.1yr.F -4.8259682  0.39929033 -0.016428020
+## 5053.2w.F   3.2932453  2.30299477  0.813801957
+## 5053.8w.F   0.8917011 -2.11641360  0.478404284
+

Plot the xyz coordinates and color by age

+
plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) #Columns 1-3 of BCxyz are the three nMDS axes; markers are colored by AgeGroup
+
+ +

Note: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so.

+
par(mfrow=c(1,2))
+#Left panel: first nMDS axis against the second
+plot(BCxyz[,1], BCxyz[,2], pch=20, col=c("blue", "green", "red")[meta$AgeGroup], main="Bray-Curtis 1:2")
+legend(x=-5.4, y=3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Right panel: first nMDS axis against the third (no legend; same colors as the left panel)
+plot(BCxyz[,1], BCxyz[,3], pch=20, col=c("blue", "green", "red")[meta$AgeGroup], main="Bray-Curtis 1:3")
+

+
+
+
+

Phylogenetic-based metrics

+

The most common of this type of beta-diversity metrics is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant.

+

Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data.

+

Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands.

+
+

Create physeq object

+

To start, you must make a phyloseq object which includes the OTU.clean, meta, and tax.clean data. We tell R which tables are each type

+
OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE) #taxa_are_rows=FALSE: samples are the rows, OTUs are the columns
+tax.UF = tax_table(as.matrix(tax.clean)) #Taxonomy is likewise converted to a matrix before wrapping
+meta.UF = sample_data(meta) #Sample metadata is wrapped as-is
+

We then merge these into an object of class phyloseq.

+
physeq = phyloseq(OTU.UF, tax.UF, meta.UF) #Bundles the three tables; the tree is merged in a later step
+

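To add the phylogenetic component to UniFrac, we need a phylogenetic tree of our OTUs. UniFrac needs a root; if the tree does not have one, a root is picked at random (you will see a warning about this further down). Calculating a tree takes a long time so we have provided the tree for you.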

+

However, if we were to calculate a tree, first, we import a distance matrix created from representative sequences of our OTUs. We would use the import_mothur_dist function (from phyloseq) to read the file, as it was created in mothur, as seen under “Trees of OTUs” here.

+

DO NOT RUN THIS

+
dist.mat = import_mothur_dist("clean_repFasta.phylip.dist") #Not run in this tutorial; shown only to document how the provided tree was made
+

We would then calculate a neighbor-joining tree from the distance matrix using the ape package. Note that a neighbor-joining tree is unrooted unless you root it yourself afterwards.

+

DO NOT RUN THIS

+
NJ.tree = bionj(dist.mat) #Not run in this tutorial; the resulting NJ.tree object is loaded from disk instead
+

Instead, we have pre-calculated this tree and you can load it with

+
load("Data/NJ.tree.Rdata") #Restores the saved object into the workspace; the next step expects it to be called NJ.tree
+

Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations.

+
physeq.tree = merge_phyloseq(physeq, NJ.tree) #New object = the three tables in physeq plus the tree
+

We can look at this object and see its components.

+
physeq.tree #Typing the object name prints a one-line summary of each component
+
## phyloseq-class experiment-level object
+## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
+## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
+## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
+## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
+
+
+

Dot plots

+

Calculate weighted UniFrac (i.e. diversity) distances and ordinate into an nMDS. We specify weighted with weighted=TRUE.

+
wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE) #Computes weighted UniFrac and runs the nMDS in one call; expect a "Randomly assigning root" warning with this tree
+
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00062 --
+## in the phylogenetic tree in the data you provided.
+
## Run 0 stress 0.0864543 
+## Run 1 stress 0.08645377 
+## ... New best solution
+## ... Procrustes: rmse 0.0001213931  max resid 0.0003141587 
+## ... Similar to previous best
+## Run 2 stress 0.1335727 
+## Run 3 stress 0.1463023 
+## Run 4 stress 0.08645329 
+## ... New best solution
+## ... Procrustes: rmse 0.0007206919  max resid 0.001920389 
+## ... Similar to previous best
+## Run 5 stress 0.1270238 
+## Run 6 stress 0.1157455 
+## Run 7 stress 0.1143571 
+## Run 8 stress 0.1317677 
+## Run 9 stress 0.08645345 
+## ... Procrustes: rmse 5.804039e-05  max resid 0.0001620988 
+## ... Similar to previous best
+## Run 10 stress 0.08808605 
+## Run 11 stress 0.08645348 
+## ... Procrustes: rmse 0.000642139  max resid 0.001706552 
+## ... Similar to previous best
+## Run 12 stress 0.1157451 
+## Run 13 stress 0.0864534 
+## ... Procrustes: rmse 4.051435e-05  max resid 0.0001125382 
+## ... Similar to previous best
+## Run 14 stress 0.1143564 
+## Run 15 stress 0.08659435 
+## ... Procrustes: rmse 0.004251655  max resid 0.01804703 
+## Run 16 stress 0.1295296 
+## Run 17 stress 0.0864538 
+## ... Procrustes: rmse 0.000161137  max resid 0.0004585026 
+## ... Similar to previous best
+## Run 18 stress 0.1347981 
+## Run 19 stress 0.08645297 
+## ... New best solution
+## ... Procrustes: rmse 0.0003657154  max resid 0.0008934259 
+## ... Similar to previous best
+## Run 20 stress 0.08808625 
+## *** Solution reached
+

You can plot UniFrac nMDS using the basic plot function as we’ve done before.

+
par(mfrow=c(1,1))
+#Empty frame for the weighted UniFrac ordination; the points are added next
+plot(wUF.ordu, main="Weighted UniFrac", type="n")
+
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
+## Species scores not available
+
points(wUF.ordu, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=0.3, y=0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+

+

But let’s also look at the ggplot2 package. This package is incredibly powerful and can be customized in many ways. This document has many helpful tips.

+
plot_ordination(physeq.tree, wUF.ordu, color="AgeGroup", type="sites") + 
+  theme_bw() + 
+  ggtitle("Weighted UniFrac") + 
+  scale_colour_manual(values=c("1yr"="blue", "2w"="green", "8w"="red"))
+

+

Unweighted UniFrac (i.e. richness) can be visualized in the same way. We specify unweighted with weighted=FALSE.

+
uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE) #Identical to the weighted call apart from weighted=FALSE
+
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00541 --
+## in the phylogenetic tree in the data you provided.
+
## Run 0 stress 9.987482e-05 
+## Run 1 stress 9.657832e-05 
+## ... New best solution
+## ... Procrustes: rmse 8.116964e-05  max resid 0.0002828867 
+## ... Similar to previous best
+## Run 2 stress 9.871795e-05 
+## ... Procrustes: rmse 8.086551e-05  max resid 0.0002819207 
+## ... Similar to previous best
+## Run 3 stress 9.488633e-05 
+## ... New best solution
+## ... Procrustes: rmse 7.261513e-05  max resid 0.0002642818 
+## ... Similar to previous best
+## Run 4 stress 9.862006e-05 
+## ... Procrustes: rmse 1.701212e-05  max resid 5.025533e-05 
+## ... Similar to previous best
+## Run 5 stress 9.806631e-05 
+## ... Procrustes: rmse 0.0001070474  max resid 0.0002353733 
+## ... Similar to previous best
+## Run 6 stress 9.757454e-05 
+## ... Procrustes: rmse 3.98567e-05  max resid 0.0001388533 
+## ... Similar to previous best
+## Run 7 stress 9.826177e-05 
+## ... Procrustes: rmse 9.722144e-05  max resid 0.0002191938 
+## ... Similar to previous best
+## Run 8 stress 9.695708e-05 
+## ... Procrustes: rmse 7.448698e-05  max resid 0.0002751689 
+## ... Similar to previous best
+## Run 9 stress 9.907648e-05 
+## ... Procrustes: rmse 9.311e-05  max resid 0.000238829 
+## ... Similar to previous best
+## Run 10 stress 9.98514e-05 
+## ... Procrustes: rmse 3.384728e-05  max resid 0.0001260402 
+## ... Similar to previous best
+## Run 11 stress 9.684607e-05 
+## ... Procrustes: rmse 0.0001319038  max resid 0.0003356482 
+## ... Similar to previous best
+## Run 12 stress 9.69891e-05 
+## ... Procrustes: rmse 8.404061e-06  max resid 2.44767e-05 
+## ... Similar to previous best
+## Run 13 stress 0.0002969569 
+## ... Procrustes: rmse 0.0003866362  max resid 0.000671547 
+## ... Similar to previous best
+## Run 14 stress 9.723199e-05 
+## ... Procrustes: rmse 3.73183e-05  max resid 0.0001336345 
+## ... Similar to previous best
+## Run 15 stress 9.99257e-05 
+## ... Procrustes: rmse 0.0001270357  max resid 0.0003614344 
+## ... Similar to previous best
+## Run 16 stress 9.955355e-05 
+## ... Procrustes: rmse 6.05626e-05  max resid 0.000167376 
+## ... Similar to previous best
+## Run 17 stress 9.53228e-05 
+## ... Procrustes: rmse 1.683611e-05  max resid 4.607231e-05 
+## ... Similar to previous best
+## Run 18 stress 9.633493e-05 
+## ... Procrustes: rmse 3.660488e-05  max resid 0.000132421 
+## ... Similar to previous best
+## Run 19 stress 9.921893e-05 
+## ... Procrustes: rmse 1.085923e-05  max resid 1.669451e-05 
+## ... Similar to previous best
+## Run 20 stress 9.637055e-05 
+## ... Procrustes: rmse 6.45069e-05  max resid 0.0001970588 
+## ... Similar to previous best
+## *** Solution reached
+
## Warning in metaMDS(ps.dist): Stress is (nearly) zero - you may have
+## insufficient data
+
plot_ordination(physeq.tree, uwUF.ordu, color="AgeGroup", type="sites") + 
+  theme_bw() + 
+  ggtitle("Unweighted UniFrac") + 
+  scale_colour_manual(values=c("1yr"="blue", "2w"="green", "8w"="red"))
+

+
+
+

Ellipses

+

Ellipses can be plotted instead of points as well. With the basic plot function:

+
plot(wUF.ordu, type="n", main="Weighted UniFrac") #Blank frame only; the legend and ellipses are layered on top
+
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
+## Species scores not available
+
legend(x=0.3, y=0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+
+#Shade one 99% standard-error ellipse per age group, each in its legend color
+for (age in c("2w", "8w", "1yr")) {
+  ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col=c("2w"="green", "8w"="red", "1yr"="blue")[[age]], draw="polygon", alpha=200, show.groups=age, border=FALSE)
+}
+

+

We can also plot ellipses in ggplot2. However, these ellipses are not exactly the same as the standard error ellipses used with OTU-based metrics, as they use different underlying calculations. Still, they get at the same question of confidence intervals for groups of points on an nMDS.

+

We plot ellipses with ggplot2 by adding the stat_ellipse function to our plot.

+
plot_ordination(physeq.tree, wUF.ordu, color="AgeGroup", type="sites") + 
+  stat_ellipse() + 
+  theme_bw() + 
+  ggtitle("Weighted UniFrac") + 
+  scale_colour_manual(values=c("1yr"="blue", "2w"="green", "8w"="red"))
+

+
+
+

3D plots

+

3D UniFrac ordinations are not currently supported by phyloseq. We see that our ordinations only include 2 dimensions.

+
wUF.ordu #Printing the ordination shows its settings; look at the "Dimensions" line
+
## 
+## Call:
+## metaMDS(comm = ps.dist) 
+## 
+## global Multidimensional Scaling using monoMDS
+## 
+## Data:     ps.dist 
+## Distance: user supplied 
+## 
+## Dimensions: 2 
+## Stress:     0.08645297 
+## Stress type 1, weak ties
+## Two convergent solutions found after 20 tries
+## Scaling: centring, PC rotation 
+## Species: scores missing
+

But we can instead calculate UniFrac distances using UniFrac and ordinating for 3-axes with metaMDS.

+
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE) #Keeps the distances as their own object so they can be ordinated separately with k=3
+
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
+## Randomly assigning root as -- Otu03194 -- in the phylogenetic tree in the
+## data you provided.
+
wUF.nmds.3D = metaMDS(wUF.dist, k=3) #3-axis nMDS on the precomputed UniFrac distances; the former method="NMDS" belongs to phyloseq's ordinate(), is not a metaMDS argument and was silently ignored
+
## Run 0 stress 0.04217486 
+## Run 1 stress 0.05952615 
+## Run 2 stress 0.05952709 
+## Run 3 stress 0.042174 
+## ... New best solution
+## ... Procrustes: rmse 0.0003317483  max resid 0.0007893038 
+## ... Similar to previous best
+## Run 4 stress 0.04217542 
+## ... Procrustes: rmse 0.0005403913  max resid 0.0014387 
+## ... Similar to previous best
+## Run 5 stress 0.0421741 
+## ... Procrustes: rmse 0.0001810271  max resid 0.000555628 
+## ... Similar to previous best
+## Run 6 stress 0.05952602 
+## Run 7 stress 0.04217451 
+## ... Procrustes: rmse 0.0003976044  max resid 0.001227917 
+## ... Similar to previous best
+## Run 8 stress 0.06815104 
+## Run 9 stress 0.05952564 
+## Run 10 stress 0.04217457 
+## ... Procrustes: rmse 0.0004479109  max resid 0.001435945 
+## ... Similar to previous best
+## Run 11 stress 0.04217428 
+## ... Procrustes: rmse 0.0003207273  max resid 0.0009212836 
+## ... Similar to previous best
+## Run 12 stress 0.04217476 
+## ... Procrustes: rmse 0.0004904995  max resid 0.001357519 
+## ... Similar to previous best
+## Run 13 stress 0.04217443 
+## ... Procrustes: rmse 0.0003308483  max resid 0.0008748533 
+## ... Similar to previous best
+## Run 14 stress 0.04217414 
+## ... Procrustes: rmse 0.0002102509  max resid 0.000611423 
+## ... Similar to previous best
+## Run 15 stress 0.04217491 
+## ... Procrustes: rmse 0.0005257634  max resid 0.001791904 
+## ... Similar to previous best
+## Run 16 stress 0.04217454 
+## ... Procrustes: rmse 0.000398692  max resid 0.001121448 
+## ... Similar to previous best
+## Run 17 stress 0.04217553 
+## ... Procrustes: rmse 0.0004447142  max resid 0.001546131 
+## ... Similar to previous best
+## Run 18 stress 0.04217399 
+## ... New best solution
+## ... Procrustes: rmse 0.0001824097  max resid 0.0005684325 
+## ... Similar to previous best
+## Run 19 stress 0.04217406 
+## ... Procrustes: rmse 7.68744e-05  max resid 0.0001772352 
+## ... Similar to previous best
+## Run 20 stress 0.04217417 
+## ... Procrustes: rmse 0.0001240512  max resid 0.0002862878 
+## ... Similar to previous best
+## *** Solution reached
+

Then, similar to what we did with Bray-Curtis/Jaccard, we pull out the xyz values and plot with plotly.

+
wUFxyz = scores(wUF.nmds.3D, display="sites") #Per-sample coordinates on the fitted axes
+#Print the table: one row per sample, one column per axis (NMDS1, NMDS2, NMDS3)
+wUFxyz
+
##                  NMDS1        NMDS2       NMDS3
+## 5017.1yr.F -0.19591424  0.107765310  0.07968290
+## 5017.2w.F   0.40329083  0.187040546 -0.11891085
+## 5017.8w.F  -0.06738145  0.046058811 -0.21927277
+## 5020.1yr.F -0.21311918  0.100813200  0.06833139
+## 5020.2w.F  -0.02918765 -0.163606283 -0.02929884
+## 5020.8w.F   0.03375300  0.054503745 -0.09099989
+## 5026.1yr.F -0.22482781  0.066613100  0.05594134
+## 5026.2w.F   0.13241677 -0.217029557  0.08745439
+## 5026.8w.F   0.38996273  0.135464299  0.24011205
+## 5031.1yr.F -0.19996967  0.080398029  0.09445703
+## 5031.2w.F   0.19084848 -0.256852240  0.01563640
+## 5031.8w.F  -0.13587208 -0.042300350 -0.02591350
+## 5037.1yr.F -0.21800838  0.076413856  0.07189119
+## 5037.2w.F   0.05187202 -0.120151694 -0.04223782
+## 5037.8w.F   0.14227112 -0.115591151 -0.01897721
+## 5041.1yr.F -0.20911338  0.081709200  0.07441520
+## 5041.2w.F   0.27813371 -0.237693762  0.03647625
+## 5041.8w.F  -0.13928666 -0.001531998 -0.18656755
+## 5045.1yr.F -0.23328251  0.051043269  0.06274834
+## 5045.2w.F   0.49259170  0.294540193 -0.14634317
+## 5045.8w.F  -0.16902451 -0.126094687 -0.13841874
+## 5053.1yr.F -0.21539833  0.077884489  0.08008741
+## 5053.2w.F   0.27502987 -0.030380383  0.17559141
+## 5053.8w.F  -0.13978439 -0.049015941 -0.12588496
+
plot_ly(x=wUFxyz[,1], y=wUFxyz[,2], z=wUFxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red")) #Columns 1-3 of wUFxyz are the three nMDS axes; markers are colored by AgeGroup
+
+ +
+
+
+

Vectors for continuous variables

+

While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots.

+

To do this, we first fit the variables to our distances using the envfit function in vegan. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac.

+
fit.BC = envfit(BC.nmds, meta) #Passing the whole meta table fits every column it contains
+fit.BC
+
## 
+## ***VECTORS
+## 
+##             NMDS1    NMDS2     r2 Pr(>r)    
+## AgeExact -0.99887 -0.04744 0.9765  0.001 ***
+## ADGKG     0.12503  0.99215 0.0770  0.444    
+## chao     -0.98567  0.16868 0.9599  0.001 ***
+## shannon  -0.69400  0.71997 0.9469  0.001 ***
+## simpson   0.42087 -0.90712 0.7353  0.001 ***
+## ace      -0.99746  0.07129 0.9078  0.001 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+## 
+## ***FACTORS:
+## 
+## Centroids:
+##                   NMDS1   NMDS2
+## Animalcow5017   -0.1841  0.5449
+## Animalcow5020    0.0059  0.6577
+## Animalcow5026    0.4243 -0.8826
+## Animalcow5031   -0.2442  0.1175
+## Animalcow5037    0.4946 -0.0566
+## Animalcow5041    0.0500 -0.0290
+## Animalcow5045   -0.1374 -0.3384
+## Animalcow5053   -0.4090 -0.0134
+## AgeGroup1yr     -4.4470 -0.1800
+## AgeGroup2w       2.5047 -1.0509
+## AgeGroup8w       1.9422  1.2309
+## AgeGroup.ord2w   2.5047 -1.0509
+## AgeGroup.ord8w   1.9422  1.2309
+## AgeGroup.ord1yr -4.4470 -0.1800
+## 
+## Goodness of fit:
+##                  r2 Pr(>r)    
+## Animal       0.0248  0.997    
+## AgeGroup     0.9134  0.001 ***
+## AgeGroup.ord 0.9134  0.001 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+

We see that it has automatically fit every variable in our meta table.

+

The simplest way around this is to just ask envfit to run on only the variables you want.

+
fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")]) #Overwrites fit.BC with a fit restricted to these two columns
+fit.BC
+
## 
+## ***VECTORS
+## 
+##         NMDS1   NMDS2    r2 Pr(>r)
+## ADGKG 0.12503 0.99215 0.077  0.452
+## Permutation: free
+## Number of permutations: 999
+## 
+## ***FACTORS:
+## 
+## Centroids:
+##               NMDS1   NMDS2
+## AgeGroup1yr -4.4470 -0.1800
+## AgeGroup2w   2.5047 -1.0509
+## AgeGroup8w   1.9422  1.2309
+## 
+## Goodness of fit:
+##              r2 Pr(>r)    
+## AgeGroup 0.9134  0.001 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+

We repeat for weighted UniFrac

+
fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")]) #Same two variables, fitted to the 2D weighted UniFrac ordination
+fit.wUF
+
## 
+## ***VECTORS
+## 
+##          NMDS1    NMDS2     r2 Pr(>r)
+## ADGKG -0.17846  0.98395 0.0398   0.66
+## Permutation: free
+## Number of permutations: 999
+## 
+## ***FACTORS:
+## 
+## Centroids:
+##               NMDS1   NMDS2
+## AgeGroup1yr -0.1076 -0.0834
+## AgeGroup2w   0.1432  0.0322
+## AgeGroup8w  -0.0356  0.0511
+## 
+## Goodness of fit:
+##              r2 Pr(>r)    
+## AgeGroup 0.5588  0.001 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+

For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group’s name. For continuous variables, it adds an arrow in the direction from smallest to largest value.

+

Note: The P-values for variables in envfit are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, envfit P-values tell you how well the arrow or centroids fit the x-y data of the nMDS, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was not significant, the fitted arrow/centroids will also not be significant. We see this type of result here, but this will not always be the case.

+

However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it’s difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in envfit that you know are significant in other tests may still be represented on an nMDS as a visual aid.

+

Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won’t use this plot in a publication, but it is good practice.

+

For Bray-Curtis:

+
plot(BC.nmds, main="Bray-Curtis", type="n")
+points(BC.nmds, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=-6, y=2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Overlay everything stored in fit.BC, significant or not
+plot(fit.BC, col="black")
+

+

You could also ask it to only plot variables with a fit P-value < 0.05. So we would only see the centroids

+
plot(BC.nmds, main="Bray-Curtis", type="n")
+points(BC.nmds, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=-6, y=2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Overlay only the fitted variables whose P-value does not exceed p.max
+plot(fit.BC, p.max=0.05, col="black")
+

+

Weighted UniFrac

+
plot(wUF.ordu, type="n", main="Weighted UniFrac") #Blank frame only; points, legend and fitted variables are layered on top
+
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
+## Species scores not available
+
points(wUF.ordu, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=.3, y=.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Overlay everything stored in fit.wUF, significant or not
+plot(fit.wUF, col="black")
+

+

You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group.

+

Fitting all OTUs would take awhile so we will only fit the first 10 in our table.

+
fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10]) #Columns 1-10 are the first ten OTUs in the table
+fit.BC.OTU
+
## 
+## ***VECTORS
+## 
+##             NMDS1    NMDS2     r2 Pr(>r)    
+## Otu00001  0.71738 -0.69668 0.2478  0.033 *  
+## Otu00002  0.46984 -0.88275 0.2109  0.057 .  
+## Otu00003  0.25719 -0.96636 0.2503  0.021 *  
+## Otu00004  0.25006  0.96823 0.2738  0.030 *  
+## Otu00005  0.15473  0.98796 0.2910  0.003 ** 
+## Otu00006 -0.96867  0.24837 0.6743  0.001 ***
+## Otu00007  0.17991 -0.98368 0.2488  0.009 ** 
+## Otu00008  0.40157  0.91583 0.3108  0.016 *  
+## Otu00009  0.26275 -0.96487 0.1894  0.062 .  
+## Otu00010  0.33868 -0.94090 0.1552  0.078 .  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+
#Only OTU arrows with a fit P-value of at most 0.05 are drawn here
+plot(BC.nmds, main="Bray-Curtis", type="n")
+points(BC.nmds, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=-6, y=-1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Overlay the fitted OTU vectors, filtered by p.max
+plot(fit.BC.OTU, p.max=0.05, col="black")
+

+

You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs.

+
#Extract all OTUs within the genus Ruminococcus
+#which() skips OTUs with a missing genus, and drop=FALSE keeps a data frame even if only one OTU matches
+#(assumes the rows of tax.clean are in the same order as the columns of OTU.clean - TODO confirm)
+OTU.Rumino = OTU.clean[, which(tax.clean$Genus == "g__Ruminococcus"), drop=FALSE]
+#Sum the abundances of the Ruminococcus OTUs into one variable (column)
+OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino)
+
+#Fit the new Ruminococcus group; a bare vector has no name, so the printed row is labelled [1,]
+fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum)
+fit.BC.Rumino
+
## 
+## ***VECTORS
+## 
+##         NMDS1    NMDS2     r2 Pr(>r)    
+## [1,] -0.14506  0.98942 0.6621  0.001 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## Permutation: free
+## Number of permutations: 999
+
#Draw the ordination and add the single fitted arrow
+plot(BC.nmds, main="Bray-Curtis", type="n")
+points(BC.nmds, display="sites", col=c("blue", "green", "red")[meta$AgeGroup], pch=20)
+legend(x=-6, y=-1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#labels supplies the arrow's text, since the fitted vector itself is unnamed
+plot(fit.BC.Rumino, labels=c("Ruminococcus"), col="black")
+

+
+
+
+

Statistically test beta-diversity

+

While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest.

+

You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are significant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. For example, group 1 has a lot of Prevotella OTU1 and group 2 has a lot of Prevotella OTU2, but they are both Prevotella so UniFrac treats them as being very similar.

+
+

PERMANOVA

+

For Bray-Curtis or Jaccard, we use the vegan package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model.

+

Note: adonis cannot handle or account for NA or blanks in your data. Subset to only samples with complete metadata before running vegdist if these exist.

+
#Calculate distance and save as a matrix
+#vegdist() chooses the index with `method=`; a `distance=` argument is not recognised and is silently ignored
+BC.dist=vegdist(OTU.clean, method="bray")
+#Run PERMANOVA on distances.
+#AgeGroup*ADGKG fits both main effects plus their interaction, tested sequentially
+adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = BC.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
+## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.618382    
+## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.929071    
+## Residuals      18    4.4620 0.24789         0.49969             
+## Total          23    8.9296                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

Similarly for Jaccard

+
#vegdist() chooses the index with `method=`. Passing `distance="jaccard"` is silently ignored and
+#falls back to the default Bray-Curtis, which makes the "Jaccard" statistics identical to the BC.dist model.
+#Add binary=TRUE if a presence/absence Jaccard index is intended.
+J.dist=vegdist(OTU.clean, method="jaccard")
+adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = J.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
+## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.632368    
+## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.920080    
+## Residuals      18    4.4620 0.24789         0.49969             
+## Total          23    8.9296                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

We see that the interaction is not significant so we remove it.

+
#Additive PERMANOVA on BC.dist: same terms as before but without the AgeGroup:ADGKG interaction
adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = BC.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
+## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.616384    
+## Residuals 20    4.7597 0.23798         0.53302             
+## Total     23    8.9296                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
#Same additive model (no interaction term) run on the J.dist distances
adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = J.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
+## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.566434    
+## Residuals 20    4.7597 0.23798         0.53302             
+## Total     23    8.9296                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

For UniFrac, we use the phyloseq package to calculate distances and then vegan to run PERMANOVA.

+
#Weighted, normalized UniFrac distances from the phyloseq object; phyloseq picks a random root for the tree (see warning below)
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE)
+
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
+## Randomly assigning root as -- Otu00842 -- in the phylogenetic tree in the
+## data you provided.
+
#PERMANOVA on weighted UniFrac with both main effects and their interaction
adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = wUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup        2   0.71682 0.35841  7.6290 0.43422 0.000999 ***
+## ADGKG           1   0.03281 0.03281  0.6984 0.01988 0.665335    
+## AgeGroup:ADGKG  2   0.05553 0.02777  0.5910 0.03364 0.871129    
+## Residuals      18   0.84564 0.04698         0.51226             
+## Total          23   1.65080                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
#Unweighted UniFrac distances (weighted=FALSE); a random root is again assigned to the tree (see warning below)
uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE)
+
## Warning in UniFrac(physeq.tree, weighted = FALSE, normalized = TRUE):
+## Randomly assigning root as -- Otu01729 -- in the phylogenetic tree in the
+## data you provided.
+
#PERMANOVA on unweighted UniFrac with both main effects and their interaction
adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = uwUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup        2    3.4956 1.74781  9.1479 0.46952 0.000999 ***
+## ADGKG           1    0.2434 0.24343  1.2741 0.03270 0.218781    
+## AgeGroup:ADGKG  2    0.2669 0.13344  0.6984 0.03585 0.832168    
+## Residuals      18    3.4391 0.19106         0.46193             
+## Total          23    7.4450                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

Remove non-significant interaction term

+
#Additive PERMANOVA on weighted UniFrac (interaction term dropped)
adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = wUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup   2   0.71682 0.35841  7.9543 0.43422 0.000999 ***
+## ADGKG      1   0.03281 0.03281  0.7282 0.01988 0.626374    
+## Residuals 20   0.90117 0.04506         0.54590             
+## Total     23   1.65080                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
#Additive PERMANOVA on unweighted UniFrac (interaction term dropped)
adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
+
## 
+## Call:
+## adonis(formula = uwUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
+## 
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Terms added sequentially (first to last)
+## 
+##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
+## AgeGroup   2    3.4956 1.74781  9.4324 0.46952 0.000999 ***
+## ADGKG      1    0.2434 0.24343  1.3137 0.03270 0.206793    
+## Residuals 20    3.7060 0.18530         0.49778             
+## Total     23    7.4450                 1.00000             
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+
+

ANOSIM

+

If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if your group sizes are very, very different, like 10 vs 100.

+

For example, Bray-Curtis:

+
#ANOSIM on the BC.dist distances with AgeGroup as the single grouping factor
anosim(BC.dist, meta$AgeGroup, permutations = 1000)
+
## 
+## Call:
+## anosim(dat = BC.dist, grouping = meta$AgeGroup, permutations = 1000) 
+## Dissimilarity: bray 
+## 
+## ANOSIM statistic R: 0.8467 
+##       Significance: 0.000999 
+## 
+## Permutation: free
+## Number of permutations: 1000
+

Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.

+
+
+

2D variables

+

These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi.

+

Mantel from vegan tests if two distance matrices co-vary e.g. does the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter.

+

You can only compare samples where you have both types of data, so we must subset our OTU table to only the samples that we also have SCFA data for. The names are a little different between the tables, so we also add “.F” to the SCFA names to make them match.

+
#Keep only the samples that also have SCFA data (SCFA row names plus the ".F" suffix).
+#Rows are selected by name in SCFA row order: mantel() and cor() pair samples by position,
+#and an %in% filter would keep OTU.clean's order, which is not guaranteed to match SCFA's.
+OTU.SCFA = OTU.clean[intersect(paste0(row.names(SCFA), ".F"), row.names(OTU.clean)),]
+

We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it.

+
#No method is given, so vegdist() uses its default index (Bray-Curtis) for both tables
dist1 = vegdist(OTU.SCFA)
+dist2 = vegdist(SCFA)
+

Run a Mantel test comparing the 2 matrices.

+
#With so few shared samples, the 100 requested permutations exceed all possible ones, so vegan enumerates the complete set instead (see messages below)
mantel(dist1, dist2, permutations=100)
+
## 'nperm' >= set of all permutations: complete enumeration.
+
## Set of permutations < 'minperm'. Generating entire set.
+
## 
+## Mantel statistic based on Pearson's product-moment correlation 
+## 
+## Call:
+## mantel(xdis = dist1, ydis = dist2, permutations = 100) 
+## 
+## Mantel statistic r: -0.02423 
+##       Significance: 0.54167 
+## 
+## Upper quantiles of permutations (null model):
+##   90%   95% 97.5%   99% 
+## 0.540 0.552 0.596 0.629 
+## Permutation: free
+## Number of permutations: 23
+

We see that the overall OTU table and SCFA tables do not co-vary.

+

You can also run Mantel on 3 matrices at once like so

+

Do not run as we do not have 3 matrices here

+
#Not run: dist3 is a placeholder for a third distance matrix that is not created in this tutorial
mantel.partial(dist1, dist2, dist3, permutations=100)
+
+
+
+

Beta dispersion

+

Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances).

+

Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM

+

Calculate dispersion (variances) within each group.

+
#Multivariate dispersion of the BC.dist distances within each AgeGroup level
disp.age = betadisper(BC.dist, meta$AgeGroup)
+

Perform an ANOVA-like test to determine if the variances differ by groups.

+
#Permutation test for differences in dispersion; pairwise=TRUE adds the table of pairwise group comparisons
permutest(disp.age, pairwise=TRUE, permutations=1000)
+
## 
+## Permutation test for homogeneity of multivariate dispersions
+## Permutation: free
+## Number of permutations: 1000
+## 
+## Response: Distances
+##           Df  Sum Sq  Mean Sq     F N.Perm   Pr(>F)    
+## Groups     2 0.47459 0.237293 30.93   1000 0.000999 ***
+## Residuals 21 0.16111 0.007672                          
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Pairwise comparisons:
+## (Observed p-value below diagonal, permuted p-value above diagonal)
+##            1yr         2w     8w
+## 1yr            9.9900e-04 0.0010
+## 2w  4.8556e-06            0.7902
+## 8w  1.2886e-06 7.7206e-01
+

Combining this with our plot,

+
plot(BC.nmds, type="n", main="Bray-Curtis")
+legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
+#Draw one filled 99% standard-error ellipse per age group, in legend order and legend colors
+invisible(Map(
+  function(grp, shade) ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col=shade, draw="polygon", alpha=200, show.groups = grp, border=FALSE),
+  c("2w","8w","1yr"),
+  c("green","red","blue")
+))
+

+

we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers.

+
+
+
+

OTUs that differ by

+
+

Categorical variables

+

Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don’t differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don’t want to look at over 5000 P-values, do you?

+

There are a number of ways to decrease the number of OTUs you’re looking at

+
    +
  1. Don’t use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest
  2. +
  3. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample
  4. +
  5. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples
  6. +
  7. Combine 2 and 3
  8. +
+

However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low abundant OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren’t useful?

+

So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. Note: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables.

+

SIMPER’s output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are cumulative, so to get each OTU’s contribution, you must subtract the previous OTU’s value.

+

For example

+
#Prints, for each pairwise age comparison, the OTUs and their cumulative contributions up to about 70%
simper(OTU.clean, meta$AgeGroup, permutations=100)
+
## cumulative contributions of most influential species:
+## 
+## $`1yr_2w`
+##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00011  Otu00006  Otu00009 
+## 0.0983761 0.1627191 0.2225335 0.2657879 0.2982889 0.3271508 0.3514210 
+##  Otu00014  Otu00022  Otu00018  Otu00012  Otu00016  Otu00004  Otu00021 
+## 0.3660756 0.3793171 0.3924608 0.4048922 0.4171422 0.4283988 0.4385280 
+##  Otu00008  Otu00025  Otu00028  Otu00023  Otu00037  Otu00013  Otu00035 
+## 0.4479076 0.4565849 0.4646081 0.4723795 0.4790690 0.4857141 0.4920793 
+##  Otu00055  Otu00030  Otu00036  Otu00040  Otu00042  Otu00010  Otu00049 
+## 0.4983615 0.5045449 0.5106265 0.5166717 0.5226378 0.5274331 0.5321886 
+##  Otu00046  Otu00033  Otu00031  Otu00081  Otu00051  Otu00064  Otu00056 
+## 0.5368030 0.5413764 0.5458188 0.5500936 0.5543565 0.5582465 0.5620674 
+##  Otu00032  Otu00052  Otu00062  Otu00026  Otu00020  Otu00074  Otu00069 
+## 0.5657989 0.5695078 0.5730822 0.5765920 0.5799406 0.5831741 0.5864067 
+##  Otu00066  Otu00077  Otu00148  Otu00073  Otu00067  Otu00065  Otu00076 
+## 0.5895953 0.5927428 0.5958511 0.5989588 0.6020549 0.6051241 0.6081334 
+##  Otu00075  Otu00091  Otu00048  Otu00097  Otu00068  Otu00050  Otu00084 
+## 0.6111073 0.6140400 0.6169121 0.6196512 0.6223697 0.6250661 0.6277023 
+##  Otu00100  Otu00019  Otu00063  Otu00039  Otu00086  Otu00071  Otu00101 
+## 0.6303356 0.6329664 0.6355752 0.6381709 0.6406744 0.6431362 0.6455850 
+##  Otu00089  Otu00096  Otu00095  Otu00108  Otu00088  Otu00103  Otu00094 
+## 0.6480310 0.6504700 0.6528884 0.6553007 0.6576757 0.6600472 0.6624184 
+##  Otu00098  Otu00116  Otu00090  Otu00105  Otu00104  Otu00099  Otu00059 
+## 0.6647575 0.6670589 0.6693444 0.6716046 0.6738590 0.6760506 0.6781917 
+##  Otu00106  Otu00115  Otu00102  Otu00110  Otu00119  Otu00118  Otu00034 
+## 0.6803196 0.6824245 0.6844633 0.6865021 0.6884972 0.6904775 0.6924261 
+##  Otu00114  Otu00093  Otu00124  Otu00045 
+## 0.6943714 0.6962690 0.6981558 0.7000319 
+## 
+## $`1yr_8w`
+##   Otu00001   Otu00005   Otu00006   Otu00004   Otu00010   Otu00017 
+## 0.03765603 0.07335078 0.10010930 0.12226268 0.14087762 0.15688502 
+##   Otu00008   Otu00009   Otu00015   Otu00018   Otu00016   Otu00014 
+## 0.17205091 0.18718833 0.20107546 0.21456235 0.22713556 0.23964967 
+##   Otu00029   Otu00019   Otu00021   Otu00025   Otu00024   Otu00037 
+## 0.25102468 0.26162658 0.27202671 0.28093293 0.28829315 0.29516652 
+##   Otu00035   Otu00044   Otu00055   Otu00027   Otu00036   Otu00040 
+## 0.30170335 0.30821052 0.31465848 0.32109529 0.32733731 0.33354206 
+##   Otu00042   Otu00020   Otu00013   Otu00041   Otu00003   Otu00043 
+## 0.33966556 0.34564370 0.35158279 0.35717451 0.36261926 0.36799345 
+##   Otu00038   Otu00026   Otu00034   Otu00049   Otu00070   Otu00046 
+## 0.37334038 0.37836130 0.38334135 0.38822230 0.39310161 0.39783775 
+##   Otu00012   Otu00058   Otu00011   Otu00051   Otu00054   Otu00045 
+## 0.40234701 0.40670755 0.41102172 0.41521298 0.41939306 0.42353985 
+##   Otu00047   Otu00064   Otu00056   Otu00052   Otu00048   Otu00002 
+## 0.42764688 0.43163954 0.43556497 0.43937178 0.44313291 0.44683135 
+##   Otu00062   Otu00031   Otu00057   Otu00061   Otu00053   Otu00074 
+## 0.45050368 0.45405112 0.45759807 0.46109474 0.46455875 0.46787762 
+##   Otu00069   Otu00066   Otu00077   Otu00073   Otu00067   Otu00079 
+## 0.47119548 0.47447192 0.47770248 0.48089214 0.48406988 0.48721802 
+##   Otu00083   Otu00078   Otu00076   Otu00075   Otu00091   Otu00121 
+## 0.49033806 0.49342871 0.49651735 0.49956976 0.50257978 0.50549547 
+##   Otu00097   Otu00092   Otu00032   Otu00084   Otu00129   Otu00050 
+## 0.50830678 0.51111612 0.51389884 0.51660098 0.51922111 0.52181856 
+##   Otu00100   Otu00101   Otu00096   Otu00108   Otu00095   Otu00086 
+## 0.52434751 0.52686095 0.52936793 0.53184756 0.53429667 0.53674109 
+##   Otu00089   Otu00088   Otu00103   Otu00094   Otu00098   Otu00116 
+## 0.53918547 0.54162316 0.54405719 0.54649097 0.54889172 0.55125394 
+##   Otu00105   Otu00104   Otu00143   Otu00123   Otu00082   Otu00039 
+## 0.55357747 0.55589135 0.55819397 0.56049152 0.56278380 0.56503978 
+##   Otu00099   Otu00130   Otu00090   Otu00106   Otu00107   Otu00115 
+## 0.56728918 0.56953083 0.57176616 0.57395024 0.57611979 0.57828018 
+##   Otu00087   Otu00153   Otu00102   Otu00110   Otu00119   Otu00118 
+## 0.58042631 0.58252590 0.58461849 0.58671108 0.58875879 0.59079874 
+##   Otu00022   Otu00072   Otu00080   Otu00093   Otu00124   Otu00112 
+## 0.59281824 0.59481609 0.59678509 0.59873275 0.60067308 0.60260107 
+##   Otu00122   Otu00131   Otu00132   Otu00134   Otu00128   Otu00125 
+## 0.60450552 0.60639869 0.60828362 0.61014314 0.61199594 0.61383412 
+##   Otu00133   Otu00159   Otu00139   Otu00127   Otu00114   Otu00137 
+## 0.61566158 0.61747930 0.61928689 0.62106367 0.62282385 0.62455846 
+##   Otu00136   Otu00194   Otu00138   Otu00144   Otu00142   Otu00135 
+## 0.62629042 0.62801571 0.62974033 0.63143945 0.63312281 0.63480281 
+##   Otu00147   Otu00120   Otu00188   Otu00126   Otu00028   Otu00211 
+## 0.63647550 0.63814069 0.63980299 0.64140642 0.64300322 0.64457174 
+##   Otu00154   Otu00146   Otu00173   Otu00156   Otu00158   Otu00157 
+## 0.64612078 0.64764950 0.64917769 0.65068721 0.65217234 0.65364696 
+##   Otu00060   Otu00168   Otu00140   Otu00163   Otu00171   Otu00113 
+## 0.65508066 0.65651008 0.65793253 0.65931862 0.66069801 0.66207484 
+##   Otu00178   Otu00200   Otu00165   Otu00170   Otu00164   Otu00187 
+## 0.66344999 0.66480785 0.66616041 0.66748648 0.66881018 0.67012189 
+##   Otu00151   Otu00213   Otu00149   Otu00183   Otu00192   Otu00167 
+## 0.67141176 0.67269928 0.67397558 0.67525135 0.67652371 0.67778788 
+##   Otu00177   Otu00181   Otu00180   Otu00236   Otu00186   Otu00199 
+## 0.67904574 0.68029263 0.68151160 0.68272731 0.68393783 0.68512983 
+##   Otu00253   Otu00150   Otu00204   Otu00169   Otu00218   Otu00189 
+## 0.68632029 0.68750539 0.68867418 0.68982822 0.69097221 0.69210846 
+##   Otu00182   Otu00184   Otu00226   Otu00270   Otu00172   Otu00225 
+## 0.69323878 0.69436709 0.69548866 0.69660494 0.69770318 0.69878699 
+##   Otu00185   Otu00203 
+## 0.69986670 0.70093653 
+## 
+## $`2w_8w`
+##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00009  Otu00005  Otu00011 
+## 0.1101390 0.1804133 0.2466786 0.2952479 0.3351854 0.3745198 0.4100899 
+##  Otu00004  Otu00010  Otu00017  Otu00008  Otu00012  Otu00015  Otu00022 
+## 0.4397781 0.4641945 0.4818672 0.4987872 0.5154942 0.5307997 0.5454777 
+##  Otu00029  Otu00013  Otu00019  Otu00020  Otu00028  Otu00006  Otu00023 
+## 0.5580145 0.5704325 0.5824230 0.5910912 0.5996473 0.6081657 0.6166261 
+##  Otu00024  Otu00027  Otu00031  Otu00044  Otu00030  Otu00041  Otu00043 
+## 0.6247348 0.6322130 0.6396626 0.6468237 0.6539027 0.6600291 0.6659522 
+##  Otu00038  Otu00032  Otu00026  Otu00070  Otu00033  Otu00034  Otu00047 
+## 0.6718453 0.6776585 0.6834157 0.6887933 0.6940870 0.6992933 0.7044391
+

We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups. They are not necessarily significantly different.

+

To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age.

+
#Kruskal-Wallis test of Otu00001 across the three age groups (uncorrected P-value)
kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup)
+
## 
+##  Kruskal-Wallis rank sum test
+## 
+## data:  OTU.clean$Otu00001 by meta$AgeGroup
+## Kruskal-Wallis chi-squared = 15.994, df = 2, p-value = 0.0003364
+

In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group

+
#Kruskal-Wallis test of Otu00017 across the three age groups (uncorrected P-value)
kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup)
+
## 
+##  Kruskal-Wallis rank sum test
+## 
+## data:  OTU.clean$Otu00017 by meta$AgeGroup
+## Kruskal-Wallis chi-squared = 4.9767, df = 2, p-value = 0.08305
+

Note: These P-values have not been corrected for false discovery rate (fdr) yet.

+

Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his GitHub page and we have provided them for this workshop in /Steinberger_scripts

+

Disclaimer: Andrew has provided these scripts out of the goodness of his heart and provides no guarantee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.

+

The use of these scripts is as follows (from Steinberger GitHub with some modifications)

+

simper_pretty.R

+

This script is meant to rapidly perform the SIMPER function from the R package vegan for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. Requires installation of the R package ‘vegan’.

+

Usage:

+

simper.pretty(x, metrics, c(‘interesting’), perc_cutoff=0.5, low_cutoff = ‘y’, low_val=0.01, ‘output_name’)

+

Inputs:

+
    +
  • x: OTU table
  • +
  • metrics: metadata table
  • +
  • interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c(‘int1’,‘int2’,‘int3’)
  • +
  • perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output.
  • +
  • low_cutoff: ‘y’ if want to REMOVE OTUs that contribute less than 1%
  • +
  • low_val: set value of low cutoff (0.01), ignored if low_cutoff=‘n’.
  • +
  • output_name: the name that is appended to the output filename “_clean_simper.csv“.
  • +
+

R_krusk.R

+

This script takes the output .csv of simper_pretty.R, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages ‘vegan’ and ‘dplyr’.

+

Usage:

+

kruskal.pretty(x, metrics, csv, c(‘interesting’), ‘output_name’, taxonomy)

+

Inputs:

+
    +
  • x: OTU table
  • +
  • metrics: metadata table
  • +
  • csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv(“PATH to name_clean_simper.csv”))
  • +
  • interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c(‘int1’,‘int2’,‘int3’)
  • +
  • output_name= the name that is appended to the output filename “_krusk_simper.csv“.
  • +
  • taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional)
  • +
+

First, we load these functions into R.

+
#Load the two helper scripts; the paths are relative to the current working directory
source("Steinberger_scripts/simper_pretty.r")
+source("Steinberger_scripts/R_krusk.r")
+

Then, we apply them to our data. We will ask for all SIMPER OTUs (perc_cutoff = 1, meaning up to cumulative 100%) but cutoff any OTUs that individually contribute less than 1% to SIMPER (low_val=0.01). You may want to consider different cutoffs for your data.

+
#SIMPER for AgeGroup: perc_cutoff=1 keeps OTUs up to 100% cumulative contribution, low_cutoff='y' with low_val=0.01 removes OTUs contributing under 1%
#The output name 'Age' is appended to "_clean_simper.csv", giving Age_clean_simper.csv
simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age')
+
+#Read the SIMPER output back in as a data.frame, the form kruskal.pretty expects for its csv input
+simper.results = data.frame(read.csv("Age_clean_simper.csv"))
+#Kruskal-Wallis on each SIMPER OTU; output name 'Age' gives Age_krusk_simper.csv
+#tax is presumably the unaltered mothur taxonomy table loaded earlier - confirm
+kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax)
+

If we import the Kruskal-Wallis results back into R and select only OTUs that were significantly different after fdr correction (fdr_krusk_p.val)…

+
#Import
+KW.results = data.frame(read.csv("Age_krusk_simper.csv"))
+#Remove non-significant
+#which() drops rows whose fdr P-value is NA; plain logical indexing would return them as all-NA rows
+KW.results.signif = KW.results[which(KW.results$fdr_krusk_p.val < 0.05),]
+#Order by OTU#
+KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),]
+#Show the first rows of the significant, OTU-ordered results
+head(KW.results.signif)
+
##     X Comparison     SIMPER      OTU  krusk_p.val fdr_krusk_p.val
+## 2   2     1yr_2w 0.06434298 Otu00001 0.0004510953     0.001383359
+## 15 15     1yr_8w 0.03765603 Otu00001 0.0004510953     0.001383359
+## 1   1     1yr_2w 0.09837610 Otu00002 0.0004510953     0.001383359
+## 30 30      2w_8w 0.11013903 Otu00002 0.0208625823     0.029989962
+## 3   3     1yr_2w 0.05981442 Otu00003 0.0003310658     0.001383359
+## 32 32      2w_8w 0.06626526 Otu00003 0.0356919001     0.044373714
+##                                                                                                                   Taxonomy
+## 2          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
+## 15         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
+## 1          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
+## 30         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
+## 3  k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
+## 32 k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
+##    Left.mean.abund   Left.stdev Right.mean.abund Right.stdev
+## 2     7.109140e-06 2.010768e-05      0.128370197  0.16351829
+## 15    7.109140e-06 2.010768e-05      0.073292635  0.09803742
+## 1     7.118451e-06 2.013402e-05      0.196185324  0.23796423
+## 30    1.961853e-01 2.379642e-01      0.007205221  0.01601067
+## 3     0.000000e+00 0.000000e+00      0.119333403  0.18000346
+## 32    1.193334e-01 1.800035e-01      0.010598818  0.02126522
+

We see a number of OTUs that significantly differ by age group.

+

Looking at OTU1 as relative abundance

+
#Calculate abundance
+#Each row (sample) is divided by its own total count and scaled to percent
+abund = OTU.clean/rowSums(OTU.clean)*100
+#plot
+#AgeGroup.ord is presumably the ordered version of AgeGroup created earlier - confirm
+boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1")
+

+

and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves.

+
+
+

Continuous variables

+

For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models glm of the OTU abundances with different distributions family= similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model.

+
+
+

Correlations

+

So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the “strong” category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available.

+

Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables?

+

Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set.

+
#Remember we calculated abundance before with
+#abund = OTU.clean/rowSums(OTU.clean)*100
+
+#Subset OTUs to abundance cutoff
+#An OTU column is kept if it exceeds 5% relative abundance in any sample; the kept values come from OTU.clean, not abund
+OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))]
+
+#Kendall correlation of each retained OTU with ADGKG; rows of OTU.abund and meta are paired by position
+cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall")
+cor.kendall
+
##                  [,1]
+## Otu00001  0.189852125
+## Otu00002  0.211764129
+## Otu00003  0.027397313
+## Otu00004  0.275867615
+## Otu00005  0.165056323
+## Otu00006 -0.114462240
+## Otu00007  0.143930930
+## Otu00008  0.211764129
+## Otu00009 -0.177517901
+## Otu00010  0.176299258
+## Otu00011  0.208334326
+## Otu00012  0.017236256
+## Otu00013  0.269669049
+## Otu00015  0.018077538
+## Otu00016 -0.257293680
+## Otu00017  0.284293111
+## Otu00019  0.172479145
+## Otu00020  0.102188122
+## Otu00022 -0.034040152
+## Otu00023  0.004106646
+## Otu00024  0.073416202
+## Otu00027  0.412640807
+## Otu00029  0.076924424
+## Otu00030 -0.077670805
+## Otu00031  0.286002668
+## Otu00038 -0.271163072
+## Otu00041  0.125193349
+## Otu00043  0.189645652
+## Otu00044  0.239065695
+## Otu00053 -0.217652255
+## Otu00055 -0.112428004
+## Otu00070 -0.037317590
+

In this case, we don’t see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm.

+

Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data.

+
#Calculate abundances
+abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100
+
+#Subset OTUs to abundance cutoff
+OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))]
+
+cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall")
+cor.kendall
+
##             Formate    Acetate Propionate Isobutyrate   Butyrate
+## Otu00006  0.0000000  0.1825742  0.1825742   0.1825742  0.1825742
+## Otu00014  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
+## Otu00016 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
+## Otu00018 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
+## Otu00021 -0.9128709 -0.6666667 -0.6666667  -0.3333333 -0.6666667
+## Otu00025  0.9128709  0.6666667  0.6666667   0.3333333  0.6666667
+## Otu00035 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
+## Otu00036 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
+## Otu00037 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
+## Otu00040 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
+## Otu00042  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
+## Otu00046 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
+## Otu00049 -0.1825742 -0.3333333 -0.3333333   0.0000000 -0.3333333
+## Otu00051  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
+## Otu00052 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
+## Otu00056 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
+## Otu00064 -0.5477226 -0.3333333 -0.3333333  -0.6666667 -0.3333333
+## Otu00066 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
+## Otu00067  0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
+## Otu00069  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
+## Otu00074  0.5477226  0.6666667  0.6666667   0.3333333  0.6666667
+## Otu00077  0.1825742  0.3333333  0.3333333   0.6666667  0.3333333
+## Otu00088  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
+## Otu00089  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
+## Otu00097 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
+## Otu00100 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
+## Otu00113 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
+## Otu00192  0.5477226  0.6666667  0.6666667   1.0000000  0.6666667
+## Otu00295  0.2581989  0.2357023  0.2357023   0.7071068  0.2357023
+##            iVal.2MB   Valerate
+## Otu00006 -0.1825742  0.1825742
+## Otu00014 -0.3333333  0.0000000
+## Otu00016 -0.3333333 -0.6666667
+## Otu00018 -0.3333333 -0.6666667
+## Otu00021 -0.6666667 -0.3333333
+## Otu00025  0.6666667  0.3333333
+## Otu00035 -0.6666667 -1.0000000
+## Otu00036  0.0000000 -0.3333333
+## Otu00037  0.0000000  0.3333333
+## Otu00040 -0.6666667 -1.0000000
+## Otu00042 -0.3333333  0.0000000
+## Otu00046 -0.3333333 -0.6666667
+## Otu00049  0.3333333  0.0000000
+## Otu00051  1.0000000  0.6666667
+## Otu00052 -0.6666667 -1.0000000
+## Otu00056 -0.3333333 -0.6666667
+## Otu00064 -1.0000000 -0.6666667
+## Otu00066 -0.6666667 -1.0000000
+## Otu00067  0.6666667  0.3333333
+## Otu00069  1.0000000  0.6666667
+## Otu00074  0.0000000  0.3333333
+## Otu00077  0.3333333  0.6666667
+## Otu00088  0.0000000 -0.3333333
+## Otu00089  0.0000000 -0.3333333
+## Otu00097  0.0000000  0.3333333
+## Otu00100  0.0000000  0.3333333
+## Otu00113  0.0000000 -0.3333333
+## Otu00192  0.6666667  1.0000000
+## Otu00295  0.7071068  0.7071068
+

If the data table is too large to view in R, you can write it to a table in your project folder.

+
write.table(cor.kendall, file = "cor_kendall.csv", sep = ",")
+

We see that some OTUs strongly correlate with SCFAs. For example, Otu00021 and Otu00025 with Formate.

+
par(mfrow = c(1, 2))
+plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21")
+plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25")
+

+

Clearly we don’t have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit.

+
OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate)
+summary(OTU21.Formate)
+
## 
+## Call:
+## glm(formula = OTU.SCFA$Otu00021 ~ SCFA$Formate)
+## 
+## Deviance Residuals: 
+##       1        2        3        4  
+## -56.173   96.253  -46.747    6.668  
+## 
+## Coefficients:
+##              Estimate Std. Error t value Pr(>|t|)  
+## (Intercept)    357.75      51.46   6.952   0.0201 *
+## SCFA$Formate  -540.02     201.13  -2.685   0.1152  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for gaussian family taken to be 7324.907)
+## 
+##     Null deviance: 67454  on 3  degrees of freedom
+## Residual deviance: 14650  on 2  degrees of freedom
+## AIC: 50.175
+## 
+## Number of Fisher Scoring iterations: 2
+
OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate)
+summary(OTU25.Formate)
+
## 
+## Call:
+## glm(formula = OTU.SCFA$Otu00025 ~ SCFA$Formate)
+## 
+## Deviance Residuals: 
+##        1         2         3         4  
+##  127.727  -118.783     6.217   -15.162  
+## 
+## Coefficients:
+##              Estimate Std. Error t value Pr(>|t|)  
+## (Intercept)    219.78      74.49   2.951   0.0982 .
+## SCFA$Formate   721.00     291.12   2.477   0.1316  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for gaussian family taken to be 15346.04)
+## 
+##     Null deviance: 124819  on 3  degrees of freedom
+## Residual deviance:  30692  on 2  degrees of freedom
+## AIC: 53.133
+## 
+## Number of Fisher Scoring iterations: 2
+

So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occurring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing.

+
+
+
+

Other visualizations

+
+

Bar charts

+

The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the phyloseq / ggplot2 packages.

+

Let’s explore some of the bar chart options. First, we’ll make the classic additive bar chart for phyla in our samples

+
plot_bar(physeq.tree, fill="Phylum")
+

+

We can simplify by grouping our samples by age group

+
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") 
+

+

And removing the lines between OTUs in the bars

+
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
+

+

And only showing the top 5 most abundant phyla

+
#Sort the Phyla by abundance and pick the top 5
+top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5]
+#Cut down the physeq.tree data to only the top 5 Phyla
+top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names))
+#Plot
+plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
+

+

There are many more options within ggplot2 to alter this figure. This document has many helpful tips.

+

Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid

+
plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
+

+

And you can break it down at any taxonomic level and color by any other level.

+
+
+

Trees

+

We can also plot phylogenetic trees and label/modify them by our variables of interest.

+

Let’s look at the genus Prevotella in our data. We want to subset down to just this genus or else our plot would be too cluttered to read.

+

Subset by genus

+
prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella")
+

We can see that this worked by comparing the number of taxa in our subset and our original data

+
physeq.tree
+
## phyloseq-class experiment-level object
+## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
+## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
+## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
+## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
+
prevotella
+
## phyloseq-class experiment-level object
+## otu_table()   OTU Table:         [ 106 taxa and 24 samples ]
+## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
+## tax_table()   Taxonomy Table:    [ 106 taxa by 7 taxonomic ranks ]
+## phy_tree()    Phylogenetic Tree: [ 106 tips and 105 internal nodes ]
+

We can plot these OTUs on a tree.

+
plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE)
+

+

In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots.

+

Let’s make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs

+
plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE)
+

+

Already it’s a little difficult to read. You can view a larger page by clicking “Zoom” above the figure. Or export the figure as a PDF and save as a full page size, 8.5x11.

+

There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options.

+
+
+

Heat maps

+

There are some good options in both phyloseq and gplots to make heatmaps. We will go through phyloseq but know that the same things could be done in gplots with code specific to that package.

+
+

OTUs

+

We’re going to just look at the 20 most abundant OTUs to make it more readable.

+
#Sort the OTUs by abundance and pick the top 20
+top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20])
+#Cut down the physeq.tree data to only the top 20 OTUs
+top20OTU = prune_taxa(top20OTU.names, physeq.tree)
+

We now see that we only have 20 taxa

+
top20OTU
+
## phyloseq-class experiment-level object
+## otu_table()   OTU Table:         [ 20 taxa and 24 samples ]
+## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
+## tax_table()   Taxonomy Table:    [ 20 taxa by 7 taxonomic ranks ]
+## phy_tree()    Phylogenetic Tree: [ 20 tips and 19 internal nodes ]
+

First, you can make a heatmap of OTU abundance across all samples

+
plot_heatmap(top20OTU)
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+

And grouped by our age groups

+
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup")
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+

We can label the OTU taxa

+
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus")
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+

And group OTUs within the same Phyla

+
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum")
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+

We can also change the colors (white -> purple), including the 0s/NAs (grey).

+
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="grey")
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+

You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We’ll show Bray-Curtis as an example. Other options are

+
    +
  • bray
  • +
  • jaccard
  • +
  • wunifrac
  • +
  • uwunifrac
  • +
+
plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis")
+
## Warning: Transformation introduced infinite values in discrete y-axis
+

+
+
+

Beta-diversity

+

The other common use for heatmaps is to show distances between samples (i.e. beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS.

+

We do not want to use the plot_heatmap() function from phyloseq because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a gplots command. This command will automatically group samples by similarity (trees)

+
#Bray-Curtis
+heatmap.2(as.matrix(BC.dist))
+

+
#UniFrac
+heatmap.2(as.matrix(wUF.dist))
+

+

You could also change the colors

+
#Rainbow colors
+rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9)
+heatmap.2(as.matrix(BC.dist), col=rc)
+

+

As always, for further customization, explore with ?heatmap.2

+
+
+
+

Venn diagrams

+

Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F

+

Create a list of OTUs that occur (count > 0) in each sample.

+
    +
  • We select for the row by name with OTU.clean[“name”,]
  • +
  • We select the columns with a value >0 with OTU.clean[,apply()]
  • +
+
OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))])
+
+OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))])
+
+OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))])
+

We can then use these lists of OTUs to plot a Venn diagram with venn() from the gplots package

+
venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr))
+

+

We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr

+
OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))])
+
+OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))])
+
+OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))])
+

And plot

+
venn(list(OTU.2w, OTU.8w, OTU.1yr))
+

+

These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn.

+

Once you have these, you can use the VennDiagram package for prettier graphing options. For example, the Venn diagram for the age groups would be

+
draw.triple.venn(area1 = 385+58+71+320, area2 = 801+190+320+71, area3 = 3177+190+58+71, n12 = 320+71, n23 = 190+71, n13 = 58+71, n123 = 71, category = c("2w", "8w", "1yr"), lty = "blank", fill = c("green", "red", "blue"))
+

+
## (polygon[GRID.polygon.1343], polygon[GRID.polygon.1344], polygon[GRID.polygon.1345], polygon[GRID.polygon.1346], polygon[GRID.polygon.1347], polygon[GRID.polygon.1348], text[GRID.text.1349], text[GRID.text.1350], text[GRID.text.1351], text[GRID.text.1352], text[GRID.text.1353], text[GRID.text.1354], text[GRID.text.1355], text[GRID.text.1356], text[GRID.text.1357], text[GRID.text.1358])
+

Or we can export the OTU lists and make Venns with this online tool http://bioinformatics.psb.ugent.be/webtools/Venn/. This tool is handy in that it gives you the list of OTUs within the Venn sections so that you can see which specific bacteria are shared.

+
write.table(OTU.2w, "OTU.2w.csv", sep=",", row.names=FALSE, col.names=FALSE)
+write.table(OTU.8w, "OTU.8w.csv", sep=",", row.names=FALSE, col.names=FALSE)
+write.table(OTU.1yr, "OTU.1yr.csv", sep=",", row.names=FALSE, col.names=FALSE)
+
+
+

Networks

+
+

OTUs

+

You can plot the distances between OTUs as a network. It would be an unreadable mess to plot all the OTUs in our data set, so we will just use the smaller prevotella data set.

+
plot_net(prevotella, color="Species", type="taxa")
+

+

For co-occurrence networks of OTUs, I recommend Gephi or Cytoscape. Thus far, I have not found an R package comparable to these other programs.

+
+
+

Beta-diversity

+

You can also plot beta-diversity as a network where the edges (lines) are the distances between samples. All metrics we’ve used here are supported (bray, jaccard, wunifrac, uwunifrac)

+
plot_net(physeq.tree, color="AgeGroup", distance="bray")
+

+
+
+
+
+

Publication figures

+

Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the “Export” function within the Plots window, but this often does not result in high enough resolution.

+

Ideally, you want to save in PostScript (.ps) or PDF (.pdf) formats because they are vector-based, meaning they are not any specific dpi and do not get blurry when zoomed in. Other formats (PNG, JPG, BMP, TIFF) are pixel-based formats (little square dots) and can become jagged when zoomed in.

+

If you have issues getting a specific font to work, try installing and loading the package extrafont.

+
+

PostScript

+

Here, we will use postscript to export as a .ps. This function uses

+
    +
  • width, height: in inches (postscript has no units= argument; units= is only available for pixel-based devices such as png)
  • +
  • horizontal: TRUE = landscape, FALSE = portrait
  • +
  • colormodel: RGB, CMYK, and others
  • +
  • family: Font to be used within figures
  • +
+

Then we add layout if we have more than one plot within the overall figure.

+
    +
  • matrix: +
      +
    • A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4)
    • +
    • Then the number of rows, columns the figures should be oriented in
    • +
  • +
  • widths: A list of scalars of how large each figure should be in width.
  • +
  • heights: A list of scalars of how large each figure should be in height.
  • +
+
postscript("Fig1.ps", width = 7, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT")
+ 
+layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
+ 
+plot(BC.nmds, type="n", main="Bray-Curtis")
+points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
+ 
+boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
+ 
+dev.off()
+
## quartz_off_screen 
+##                 2
+

To open the resulting .ps file:

+
    +
  • Open it directly in Adobe Illustrator (vectors are preserved)
  • +
  • On a Mac, double-clicking on it will automatically convert it into a PDF and open it in Preview.
  • +
  • On Windows, it depends on how “file associations” are set-up. Typically the file would need some transformation on a “standard” Windows computer before it can be used. If Adobe software is installed, it could run via Distiller to convert the .ps to a PDF.
  • +
+
+
+

PDF

+

To export directly to a PDF, we will use pdf

+
pdf("Fig1.pdf", width = 7, height = 3,  colormodel = "rgb", family = "ArialMT")
+ 
+layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
+ 
+plot(BC.nmds, type="n", main="Bray-Curtis")
+points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
+ 
+boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
+ 
+dev.off()
+
## quartz_off_screen 
+##                 2
+
+
+

PNG

+

PNG is pixel-based so it may get blurry if not at high enough resolution. The exact resolution can be specified by giving the dpi in res=

+
png("Fig1.png", width = 7, height = 3, units='in', res=300)
+ 
+layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
+ 
+plot(BC.nmds, type="n", main="Bray-Curtis")
+points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
+ 
+boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
+ 
+dev.off()
+
## quartz_off_screen 
+##                 2
+
+ + +
+
+
+ + + +
+
+ +
+ + + + + + + + diff --git a/Microbiota_analysis_R/Microbiota_Analysis_in_R.pdf b/Microbiota_analysis_R/Microbiota_Analysis_in_R.pdf new file mode 100644 index 0000000..3b1e3f7 Binary files /dev/null and b/Microbiota_analysis_R/Microbiota_Analysis_in_R.pdf differ diff --git a/Microbiota_analysis_R/OTU.1yr.csv b/Microbiota_analysis_R/OTU.1yr.csv new file mode 100644 index 0000000..e098400 --- /dev/null +++ b/Microbiota_analysis_R/OTU.1yr.csv @@ -0,0 +1,3496 @@ +"Otu00001" +"Otu00002" +"Otu00006" +"Otu00011" +"Otu00013" +"Otu00014" +"Otu00016" +"Otu00018" +"Otu00019" +"Otu00020" +"Otu00021" +"Otu00022" +"Otu00025" +"Otu00026" +"Otu00032" +"Otu00035" +"Otu00036" +"Otu00037" +"Otu00040" +"Otu00042" +"Otu00046" +"Otu00048" +"Otu00049" +"Otu00051" +"Otu00052" +"Otu00053" +"Otu00055" +"Otu00056" +"Otu00057" +"Otu00059" +"Otu00062" +"Otu00064" +"Otu00065" +"Otu00066" +"Otu00067" +"Otu00069" +"Otu00072" +"Otu00073" +"Otu00074" +"Otu00075" +"Otu00076" +"Otu00077" +"Otu00081" +"Otu00084" +"Otu00086" +"Otu00088" +"Otu00089" +"Otu00090" +"Otu00091" +"Otu00092" +"Otu00093" +"Otu00094" +"Otu00095" +"Otu00096" +"Otu00097" +"Otu00098" +"Otu00099" +"Otu00100" +"Otu00101" +"Otu00102" +"Otu00103" +"Otu00104" +"Otu00105" +"Otu00106" +"Otu00108" +"Otu00110" +"Otu00113" +"Otu00114" +"Otu00115" +"Otu00116" +"Otu00118" +"Otu00119" +"Otu00122" +"Otu00124" +"Otu00125" +"Otu00126" +"Otu00127" +"Otu00130" +"Otu00131" +"Otu00132" +"Otu00133" +"Otu00134" +"Otu00138" +"Otu00139" +"Otu00142" +"Otu00144" +"Otu00145" +"Otu00146" +"Otu00147" +"Otu00149" +"Otu00153" +"Otu00154" +"Otu00157" +"Otu00158" +"Otu00159" +"Otu00161" +"Otu00162" +"Otu00163" +"Otu00164" +"Otu00165" +"Otu00167" +"Otu00168" +"Otu00169" +"Otu00172" +"Otu00173" +"Otu00176" +"Otu00177" +"Otu00178" +"Otu00181" +"Otu00182" +"Otu00183" +"Otu00185" +"Otu00186" +"Otu00187" +"Otu00188" +"Otu00191" +"Otu00192" +"Otu00193" +"Otu00196" +"Otu00197" +"Otu00198" +"Otu00200" +"Otu00201" +"Otu00202" +"Otu00203" +"Otu00204" +"Otu00205" +"Otu00210" 
+"Otu00211" +"Otu00212" +"Otu00213" +"Otu00215" +"Otu00221" +"Otu00223" +"Otu00224" +"Otu00225" +"Otu00228" +"Otu00229" +"Otu00230" +"Otu00231" +"Otu00232" +"Otu00234" +"Otu00237" +"Otu00239" +"Otu00242" +"Otu00243" +"Otu00244" +"Otu00245" +"Otu00247" +"Otu00248" +"Otu00249" +"Otu00254" +"Otu00255" +"Otu00257" +"Otu00258" +"Otu00260" +"Otu00261" +"Otu00262" +"Otu00264" +"Otu00266" +"Otu00267" +"Otu00268" +"Otu00269" +"Otu00270" +"Otu00271" +"Otu00272" +"Otu00273" +"Otu00274" +"Otu00275" +"Otu00276" +"Otu00278" +"Otu00280" +"Otu00281" +"Otu00282" +"Otu00283" +"Otu00286" +"Otu00287" +"Otu00288" +"Otu00289" +"Otu00290" +"Otu00292" +"Otu00294" +"Otu00295" +"Otu00296" +"Otu00297" +"Otu00298" +"Otu00299" +"Otu00301" +"Otu00302" +"Otu00303" +"Otu00304" +"Otu00305" +"Otu00307" +"Otu00312" +"Otu00314" +"Otu00317" +"Otu00318" +"Otu00320" +"Otu00321" +"Otu00322" +"Otu00323" +"Otu00324" +"Otu00325" +"Otu00326" +"Otu00327" +"Otu00328" +"Otu00329" +"Otu00332" +"Otu00333" +"Otu00335" +"Otu00336" +"Otu00337" +"Otu00338" +"Otu00339" +"Otu00340" +"Otu00341" +"Otu00343" +"Otu00344" +"Otu00346" +"Otu00347" +"Otu00348" +"Otu00349" +"Otu00351" +"Otu00353" +"Otu00354" +"Otu00356" +"Otu00357" +"Otu00358" +"Otu00360" +"Otu00361" +"Otu00362" +"Otu00363" +"Otu00364" +"Otu00365" +"Otu00366" +"Otu00367" +"Otu00368" +"Otu00369" +"Otu00374" +"Otu00375" +"Otu00376" +"Otu00377" +"Otu00380" +"Otu00383" +"Otu00384" +"Otu00385" +"Otu00386" +"Otu00387" +"Otu00388" +"Otu00389" +"Otu00390" +"Otu00391" +"Otu00392" +"Otu00393" +"Otu00394" +"Otu00395" +"Otu00397" +"Otu00398" +"Otu00400" +"Otu00401" +"Otu00402" +"Otu00403" +"Otu00406" +"Otu00408" +"Otu00409" +"Otu00410" +"Otu00411" +"Otu00412" +"Otu00416" +"Otu00417" +"Otu00419" +"Otu00420" +"Otu00421" +"Otu00422" +"Otu00423" +"Otu00424" +"Otu00425" +"Otu00426" +"Otu00427" +"Otu00428" +"Otu00429" +"Otu00432" +"Otu00433" +"Otu00434" +"Otu00437" +"Otu00438" +"Otu00439" +"Otu00441" +"Otu00442" +"Otu00443" +"Otu00444" +"Otu00448" +"Otu00449" +"Otu00450" 
+"Otu00452" +"Otu00453" +"Otu00455" +"Otu00457" +"Otu00458" +"Otu00460" +"Otu00463" +"Otu00466" +"Otu00467" +"Otu00468" +"Otu00470" +"Otu00471" +"Otu00472" +"Otu00473" +"Otu00474" +"Otu00476" +"Otu00477" +"Otu00478" +"Otu00479" +"Otu00480" +"Otu00481" +"Otu00482" +"Otu00483" +"Otu00484" +"Otu00485" +"Otu00487" +"Otu00488" +"Otu00491" +"Otu00492" +"Otu00493" +"Otu00494" +"Otu00498" +"Otu00499" +"Otu00500" +"Otu00501" +"Otu00502" +"Otu00503" +"Otu00504" +"Otu00505" +"Otu00506" +"Otu00508" +"Otu00509" +"Otu00510" +"Otu00511" +"Otu00513" +"Otu00514" +"Otu00515" +"Otu00516" +"Otu00517" +"Otu00518" +"Otu00519" +"Otu00521" +"Otu00522" +"Otu00523" +"Otu00524" +"Otu00525" +"Otu00526" +"Otu00529" +"Otu00530" +"Otu00532" +"Otu00533" +"Otu00534" +"Otu00535" +"Otu00537" +"Otu00538" +"Otu00540" +"Otu00541" +"Otu00542" +"Otu00544" +"Otu00545" +"Otu00546" +"Otu00548" +"Otu00549" +"Otu00550" +"Otu00551" +"Otu00552" +"Otu00553" +"Otu00554" +"Otu00558" +"Otu00560" +"Otu00561" +"Otu00562" +"Otu00563" +"Otu00564" +"Otu00565" +"Otu00566" +"Otu00567" +"Otu00568" +"Otu00569" +"Otu00570" +"Otu00573" +"Otu00574" +"Otu00575" +"Otu00576" +"Otu00577" +"Otu00578" +"Otu00579" +"Otu00580" +"Otu00581" +"Otu00582" +"Otu00585" +"Otu00586" +"Otu00587" +"Otu00588" +"Otu00590" +"Otu00591" +"Otu00592" +"Otu00594" +"Otu00595" +"Otu00597" +"Otu00598" +"Otu00599" +"Otu00600" +"Otu00601" +"Otu00603" +"Otu00604" +"Otu00605" +"Otu00607" +"Otu00609" +"Otu00611" +"Otu00612" +"Otu00613" +"Otu00614" +"Otu00615" +"Otu00616" +"Otu00617" +"Otu00618" +"Otu00619" +"Otu00622" +"Otu00623" +"Otu00624" +"Otu00625" +"Otu00626" +"Otu00628" +"Otu00629" +"Otu00630" +"Otu00631" +"Otu00632" +"Otu00633" +"Otu00634" +"Otu00635" +"Otu00636" +"Otu00638" +"Otu00639" +"Otu00640" +"Otu00641" +"Otu00642" +"Otu00644" +"Otu00647" +"Otu00649" +"Otu00650" +"Otu00653" +"Otu00654" +"Otu00655" +"Otu00656" +"Otu00657" +"Otu00659" +"Otu00660" +"Otu00661" +"Otu00663" +"Otu00664" +"Otu00666" +"Otu00667" +"Otu00669" +"Otu00673" +"Otu00674" 
+"Otu00675" +"Otu00676" +"Otu00677" +"Otu00678" +"Otu00679" +"Otu00680" +"Otu00681" +"Otu00685" +"Otu00686" +"Otu00687" +"Otu00688" +"Otu00689" +"Otu00690" +"Otu00691" +"Otu00692" +"Otu00693" +"Otu00694" +"Otu00695" +"Otu00696" +"Otu00697" +"Otu00698" +"Otu00699" +"Otu00702" +"Otu00703" +"Otu00704" +"Otu00705" +"Otu00706" +"Otu00707" +"Otu00708" +"Otu00710" +"Otu00711" +"Otu00713" +"Otu00714" +"Otu00715" +"Otu00716" +"Otu00717" +"Otu00719" +"Otu00720" +"Otu00721" +"Otu00724" +"Otu00725" +"Otu00726" +"Otu00727" +"Otu00728" +"Otu00729" +"Otu00730" +"Otu00732" +"Otu00733" +"Otu00734" +"Otu00735" +"Otu00736" +"Otu00737" +"Otu00738" +"Otu00740" +"Otu00741" +"Otu00742" +"Otu00743" +"Otu00744" +"Otu00745" +"Otu00746" +"Otu00747" +"Otu00748" +"Otu00749" +"Otu00750" +"Otu00752" +"Otu00754" +"Otu00755" +"Otu00756" +"Otu00757" +"Otu00758" +"Otu00759" +"Otu00762" +"Otu00763" +"Otu00764" +"Otu00765" +"Otu00768" +"Otu00769" +"Otu00770" +"Otu00771" +"Otu00774" +"Otu00777" +"Otu00778" +"Otu00779" +"Otu00781" +"Otu00782" +"Otu00783" +"Otu00784" +"Otu00785" +"Otu00789" +"Otu00790" +"Otu00791" +"Otu00793" +"Otu00794" +"Otu00795" +"Otu00796" +"Otu00798" +"Otu00799" +"Otu00800" +"Otu00801" +"Otu00802" +"Otu00803" +"Otu00804" +"Otu00806" +"Otu00808" +"Otu00809" +"Otu00810" +"Otu00811" +"Otu00812" +"Otu00813" +"Otu00814" +"Otu00815" +"Otu00817" +"Otu00818" +"Otu00819" +"Otu00820" +"Otu00821" +"Otu00822" +"Otu00824" +"Otu00825" +"Otu00827" +"Otu00828" +"Otu00829" +"Otu00830" +"Otu00832" +"Otu00833" +"Otu00834" +"Otu00835" +"Otu00837" +"Otu00839" +"Otu00840" +"Otu00842" +"Otu00843" +"Otu00844" +"Otu00845" +"Otu00846" +"Otu00847" +"Otu00850" +"Otu00851" +"Otu00852" +"Otu00854" +"Otu00855" +"Otu00857" +"Otu00858" +"Otu00859" +"Otu00860" +"Otu00861" +"Otu00862" +"Otu00863" +"Otu00864" +"Otu00865" +"Otu00866" +"Otu00868" +"Otu00870" +"Otu00871" +"Otu00872" +"Otu00873" +"Otu00874" +"Otu00875" +"Otu00876" +"Otu00878" +"Otu00879" +"Otu00880" +"Otu00881" +"Otu00882" +"Otu00883" +"Otu00885" 
+"Otu00886" +"Otu00887" +"Otu00888" +"Otu00891" +"Otu00892" +"Otu00893" +"Otu00894" +"Otu00895" +"Otu00896" +"Otu00897" +"Otu00898" +"Otu00899" +"Otu00900" +"Otu00902" +"Otu00904" +"Otu00905" +"Otu00906" +"Otu00907" +"Otu00909" +"Otu00910" +"Otu00911" +"Otu00912" +"Otu00913" +"Otu00914" +"Otu00916" +"Otu00919" +"Otu00920" +"Otu00921" +"Otu00922" +"Otu00923" +"Otu00924" +"Otu00925" +"Otu00926" +"Otu00927" +"Otu00928" +"Otu00929" +"Otu00930" +"Otu00931" +"Otu00932" +"Otu00934" +"Otu00937" +"Otu00939" +"Otu00941" +"Otu00942" +"Otu00943" +"Otu00944" +"Otu00945" +"Otu00946" +"Otu00947" +"Otu00948" +"Otu00951" +"Otu00953" +"Otu00954" +"Otu00956" +"Otu00957" +"Otu00958" +"Otu00959" +"Otu00960" +"Otu00961" +"Otu00962" +"Otu00963" +"Otu00964" +"Otu00966" +"Otu00969" +"Otu00970" +"Otu00971" +"Otu00972" +"Otu00973" +"Otu00975" +"Otu00976" +"Otu00978" +"Otu00980" +"Otu00981" +"Otu00982" +"Otu00983" +"Otu00984" +"Otu00985" +"Otu00986" +"Otu00987" +"Otu00988" +"Otu00989" +"Otu00990" +"Otu00992" +"Otu00993" +"Otu00995" +"Otu00996" +"Otu00997" +"Otu01000" +"Otu01002" +"Otu01003" +"Otu01004" +"Otu01005" +"Otu01006" +"Otu01008" +"Otu01009" +"Otu01010" +"Otu01011" +"Otu01012" +"Otu01013" +"Otu01014" +"Otu01016" +"Otu01017" +"Otu01018" +"Otu01019" +"Otu01020" +"Otu01022" +"Otu01023" +"Otu01024" +"Otu01025" +"Otu01026" +"Otu01027" +"Otu01028" +"Otu01029" +"Otu01030" +"Otu01031" +"Otu01032" +"Otu01034" +"Otu01035" +"Otu01037" +"Otu01038" +"Otu01039" +"Otu01041" +"Otu01042" +"Otu01044" +"Otu01045" +"Otu01047" +"Otu01048" +"Otu01049" +"Otu01051" +"Otu01052" +"Otu01053" +"Otu01054" +"Otu01056" +"Otu01057" +"Otu01058" +"Otu01059" +"Otu01060" +"Otu01061" +"Otu01062" +"Otu01063" +"Otu01064" +"Otu01065" +"Otu01066" +"Otu01067" +"Otu01068" +"Otu01069" +"Otu01070" +"Otu01071" +"Otu01072" +"Otu01073" +"Otu01074" +"Otu01075" +"Otu01076" +"Otu01077" +"Otu01078" +"Otu01079" +"Otu01083" +"Otu01085" +"Otu01086" +"Otu01088" +"Otu01089" +"Otu01090" +"Otu01092" +"Otu01094" +"Otu01096" +"Otu01097" 
+"Otu01098" +"Otu01099" +"Otu01100" +"Otu01101" +"Otu01102" +"Otu01103" +"Otu01104" +"Otu01105" +"Otu01106" +"Otu01107" +"Otu01108" +"Otu01110" +"Otu01111" +"Otu01112" +"Otu01113" +"Otu01114" +"Otu01115" +"Otu01116" +"Otu01117" +"Otu01122" +"Otu01123" +"Otu01124" +"Otu01125" +"Otu01126" +"Otu01127" +"Otu01128" +"Otu01131" +"Otu01132" +"Otu01133" +"Otu01134" +"Otu01135" +"Otu01136" +"Otu01137" +"Otu01138" +"Otu01139" +"Otu01140" +"Otu01141" +"Otu01142" +"Otu01143" +"Otu01144" +"Otu01145" +"Otu01146" +"Otu01147" +"Otu01148" +"Otu01149" +"Otu01150" +"Otu01155" +"Otu01157" +"Otu01159" +"Otu01160" +"Otu01161" +"Otu01162" +"Otu01163" +"Otu01165" +"Otu01166" +"Otu01167" +"Otu01168" +"Otu01169" +"Otu01170" +"Otu01171" +"Otu01172" +"Otu01173" +"Otu01174" +"Otu01176" +"Otu01177" +"Otu01178" +"Otu01179" +"Otu01180" +"Otu01181" +"Otu01182" +"Otu01183" +"Otu01184" +"Otu01185" +"Otu01186" +"Otu01187" +"Otu01188" +"Otu01189" +"Otu01190" +"Otu01191" +"Otu01192" +"Otu01193" +"Otu01194" +"Otu01195" +"Otu01196" +"Otu01197" +"Otu01198" +"Otu01200" +"Otu01203" +"Otu01204" +"Otu01205" +"Otu01206" +"Otu01207" +"Otu01208" +"Otu01209" +"Otu01210" +"Otu01211" +"Otu01212" +"Otu01213" +"Otu01214" +"Otu01215" +"Otu01216" +"Otu01217" +"Otu01218" +"Otu01220" +"Otu01221" +"Otu01222" +"Otu01224" +"Otu01225" +"Otu01226" +"Otu01227" +"Otu01228" +"Otu01229" +"Otu01230" +"Otu01231" +"Otu01232" +"Otu01233" +"Otu01234" +"Otu01235" +"Otu01236" +"Otu01237" +"Otu01238" +"Otu01239" +"Otu01241" +"Otu01243" +"Otu01244" +"Otu01245" +"Otu01246" +"Otu01247" +"Otu01248" +"Otu01249" +"Otu01250" +"Otu01251" +"Otu01252" +"Otu01253" +"Otu01254" +"Otu01255" +"Otu01256" +"Otu01258" +"Otu01259" +"Otu01262" +"Otu01263" +"Otu01264" +"Otu01265" +"Otu01266" +"Otu01267" +"Otu01268" +"Otu01269" +"Otu01270" +"Otu01271" +"Otu01272" +"Otu01273" +"Otu01274" +"Otu01276" +"Otu01277" +"Otu01278" +"Otu01279" +"Otu01280" +"Otu01281" +"Otu01282" +"Otu01283" +"Otu01284" +"Otu01285" +"Otu01287" +"Otu01288" +"Otu01289" +"Otu01291" 
+"Otu01292" +"Otu01293" +"Otu01294" +"Otu01295" +"Otu01296" +"Otu01298" +"Otu01299" +"Otu01300" +"Otu01301" +"Otu01302" +"Otu01303" +"Otu01304" +"Otu01305" +"Otu01306" +"Otu01308" +"Otu01309" +"Otu01310" +"Otu01311" +"Otu01312" +"Otu01313" +"Otu01314" +"Otu01315" +"Otu01316" +"Otu01317" +"Otu01318" +"Otu01319" +"Otu01320" +"Otu01321" +"Otu01322" +"Otu01323" +"Otu01324" +"Otu01325" +"Otu01327" +"Otu01328" +"Otu01329" +"Otu01330" +"Otu01332" +"Otu01333" +"Otu01334" +"Otu01335" +"Otu01336" +"Otu01338" +"Otu01339" +"Otu01340" +"Otu01341" +"Otu01342" +"Otu01343" +"Otu01344" +"Otu01345" +"Otu01347" +"Otu01348" +"Otu01349" +"Otu01350" +"Otu01351" +"Otu01352" +"Otu01353" +"Otu01354" +"Otu01355" +"Otu01356" +"Otu01357" +"Otu01358" +"Otu01360" +"Otu01361" +"Otu01362" +"Otu01363" +"Otu01366" +"Otu01368" +"Otu01370" +"Otu01371" +"Otu01372" +"Otu01373" +"Otu01374" +"Otu01376" +"Otu01377" +"Otu01378" +"Otu01380" +"Otu01381" +"Otu01382" +"Otu01383" +"Otu01384" +"Otu01386" +"Otu01387" +"Otu01388" +"Otu01391" +"Otu01392" +"Otu01394" +"Otu01395" +"Otu01396" +"Otu01397" +"Otu01398" +"Otu01401" +"Otu01402" +"Otu01403" +"Otu01404" +"Otu01405" +"Otu01406" +"Otu01407" +"Otu01408" +"Otu01409" +"Otu01412" +"Otu01413" +"Otu01414" +"Otu01416" +"Otu01417" +"Otu01418" +"Otu01419" +"Otu01420" +"Otu01421" +"Otu01422" +"Otu01423" +"Otu01425" +"Otu01426" +"Otu01427" +"Otu01428" +"Otu01429" +"Otu01430" +"Otu01431" +"Otu01432" +"Otu01433" +"Otu01436" +"Otu01437" +"Otu01438" +"Otu01439" +"Otu01440" +"Otu01441" +"Otu01442" +"Otu01443" +"Otu01445" +"Otu01446" +"Otu01447" +"Otu01448" +"Otu01449" +"Otu01450" +"Otu01451" +"Otu01452" +"Otu01453" +"Otu01455" +"Otu01456" +"Otu01457" +"Otu01458" +"Otu01459" +"Otu01460" +"Otu01461" +"Otu01462" +"Otu01463" +"Otu01465" +"Otu01467" +"Otu01468" +"Otu01469" +"Otu01470" +"Otu01472" +"Otu01473" +"Otu01476" +"Otu01478" +"Otu01479" +"Otu01480" +"Otu01483" +"Otu01484" +"Otu01485" +"Otu01486" +"Otu01487" +"Otu01488" +"Otu01489" +"Otu01490" +"Otu01491" +"Otu01492" 
+"Otu01494" +"Otu01495" +"Otu01496" +"Otu01497" +"Otu01498" +"Otu01499" +"Otu01500" +"Otu01501" +"Otu01502" +"Otu01503" +"Otu01504" +"Otu01505" +"Otu01508" +"Otu01509" +"Otu01510" +"Otu01513" +"Otu01514" +"Otu01515" +"Otu01516" +"Otu01518" +"Otu01519" +"Otu01520" +"Otu01522" +"Otu01523" +"Otu01524" +"Otu01525" +"Otu01526" +"Otu01527" +"Otu01529" +"Otu01530" +"Otu01531" +"Otu01532" +"Otu01533" +"Otu01534" +"Otu01535" +"Otu01536" +"Otu01538" +"Otu01539" +"Otu01540" +"Otu01541" +"Otu01542" +"Otu01543" +"Otu01545" +"Otu01546" +"Otu01547" +"Otu01549" +"Otu01552" +"Otu01553" +"Otu01554" +"Otu01555" +"Otu01556" +"Otu01557" +"Otu01558" +"Otu01560" +"Otu01561" +"Otu01562" +"Otu01563" +"Otu01565" +"Otu01566" +"Otu01567" +"Otu01569" +"Otu01570" +"Otu01571" +"Otu01572" +"Otu01576" +"Otu01577" +"Otu01578" +"Otu01580" +"Otu01581" +"Otu01583" +"Otu01584" +"Otu01585" +"Otu01586" +"Otu01587" +"Otu01588" +"Otu01590" +"Otu01592" +"Otu01593" +"Otu01594" +"Otu01595" +"Otu01596" +"Otu01597" +"Otu01598" +"Otu01599" +"Otu01600" +"Otu01601" +"Otu01602" +"Otu01603" +"Otu01605" +"Otu01606" +"Otu01608" +"Otu01611" +"Otu01613" +"Otu01614" +"Otu01615" +"Otu01616" +"Otu01617" +"Otu01618" +"Otu01619" +"Otu01620" +"Otu01621" +"Otu01622" +"Otu01624" +"Otu01625" +"Otu01626" +"Otu01627" +"Otu01628" +"Otu01632" +"Otu01633" +"Otu01634" +"Otu01636" +"Otu01640" +"Otu01641" +"Otu01642" +"Otu01643" +"Otu01644" +"Otu01645" +"Otu01647" +"Otu01650" +"Otu01651" +"Otu01652" +"Otu01653" +"Otu01654" +"Otu01655" +"Otu01656" +"Otu01657" +"Otu01659" +"Otu01660" +"Otu01662" +"Otu01664" +"Otu01665" +"Otu01666" +"Otu01667" +"Otu01669" +"Otu01670" +"Otu01672" +"Otu01673" +"Otu01674" +"Otu01675" +"Otu01676" +"Otu01677" +"Otu01678" +"Otu01679" +"Otu01680" +"Otu01681" +"Otu01682" +"Otu01683" +"Otu01684" +"Otu01685" +"Otu01686" +"Otu01687" +"Otu01688" +"Otu01689" +"Otu01690" +"Otu01692" +"Otu01693" +"Otu01694" +"Otu01695" +"Otu01696" +"Otu01698" +"Otu01699" +"Otu01702" +"Otu01703" +"Otu01704" +"Otu01707" +"Otu01708" 
+"Otu01710" +"Otu01711" +"Otu01713" +"Otu01717" +"Otu01718" +"Otu01720" +"Otu01723" +"Otu01724" +"Otu01725" +"Otu01728" +"Otu01729" +"Otu01730" +"Otu01732" +"Otu01734" +"Otu01736" +"Otu01738" +"Otu01739" +"Otu01740" +"Otu01742" +"Otu01743" +"Otu01744" +"Otu01745" +"Otu01746" +"Otu01748" +"Otu01752" +"Otu01753" +"Otu01754" +"Otu01755" +"Otu01756" +"Otu01757" +"Otu01759" +"Otu01762" +"Otu01764" +"Otu01768" +"Otu01769" +"Otu01771" +"Otu01772" +"Otu01773" +"Otu01774" +"Otu01775" +"Otu01776" +"Otu01778" +"Otu01780" +"Otu01781" +"Otu01782" +"Otu01783" +"Otu01785" +"Otu01786" +"Otu01787" +"Otu01788" +"Otu01789" +"Otu01790" +"Otu01791" +"Otu01792" +"Otu01794" +"Otu01798" +"Otu01799" +"Otu01801" +"Otu01802" +"Otu01803" +"Otu01805" +"Otu01806" +"Otu01807" +"Otu01808" +"Otu01809" +"Otu01810" +"Otu01811" +"Otu01812" +"Otu01813" +"Otu01814" +"Otu01815" +"Otu01816" +"Otu01817" +"Otu01818" +"Otu01819" +"Otu01820" +"Otu01822" +"Otu01823" +"Otu01824" +"Otu01825" +"Otu01826" +"Otu01827" +"Otu01828" +"Otu01829" +"Otu01830" +"Otu01831" +"Otu01832" +"Otu01833" +"Otu01834" +"Otu01835" +"Otu01836" +"Otu01837" +"Otu01839" +"Otu01841" +"Otu01843" +"Otu01844" +"Otu01845" +"Otu01846" +"Otu01847" +"Otu01848" +"Otu01849" +"Otu01850" +"Otu01851" +"Otu01852" +"Otu01853" +"Otu01854" +"Otu01855" +"Otu01856" +"Otu01857" +"Otu01858" +"Otu01861" +"Otu01862" +"Otu01863" +"Otu01864" +"Otu01865" +"Otu01866" +"Otu01868" +"Otu01869" +"Otu01870" +"Otu01872" +"Otu01873" +"Otu01874" +"Otu01875" +"Otu01876" +"Otu01880" +"Otu01883" +"Otu01885" +"Otu01887" +"Otu01890" +"Otu01891" +"Otu01892" +"Otu01893" +"Otu01895" +"Otu01896" +"Otu01897" +"Otu01898" +"Otu01899" +"Otu01900" +"Otu01901" +"Otu01903" +"Otu01905" +"Otu01906" +"Otu01907" +"Otu01909" +"Otu01910" +"Otu01912" +"Otu01914" +"Otu01916" +"Otu01917" +"Otu01918" +"Otu01919" +"Otu01920" +"Otu01922" +"Otu01924" +"Otu01925" +"Otu01926" +"Otu01927" +"Otu01930" +"Otu01932" +"Otu01936" +"Otu01938" +"Otu01939" +"Otu01940" +"Otu01941" +"Otu01942" +"Otu01943" 
+"Otu01944" +"Otu01946" +"Otu01947" +"Otu01948" +"Otu01950" +"Otu01951" +"Otu01952" +"Otu01953" +"Otu01954" +"Otu01955" +"Otu01956" +"Otu01957" +"Otu01961" +"Otu01964" +"Otu01965" +"Otu01966" +"Otu01967" +"Otu01968" +"Otu01969" +"Otu01970" +"Otu01971" +"Otu01972" +"Otu01973" +"Otu01974" +"Otu01975" +"Otu01976" +"Otu01977" +"Otu01979" +"Otu01980" +"Otu01981" +"Otu01982" +"Otu01983" +"Otu01984" +"Otu01985" +"Otu01986" +"Otu01987" +"Otu01988" +"Otu01989" +"Otu01990" +"Otu01991" +"Otu01993" +"Otu01995" +"Otu01996" +"Otu01997" +"Otu01998" +"Otu01999" +"Otu02000" +"Otu02001" +"Otu02003" +"Otu02005" +"Otu02006" +"Otu02007" +"Otu02010" +"Otu02012" +"Otu02013" +"Otu02014" +"Otu02015" +"Otu02016" +"Otu02017" +"Otu02018" +"Otu02019" +"Otu02020" +"Otu02021" +"Otu02022" +"Otu02023" +"Otu02026" +"Otu02027" +"Otu02030" +"Otu02031" +"Otu02034" +"Otu02035" +"Otu02036" +"Otu02037" +"Otu02038" +"Otu02039" +"Otu02040" +"Otu02041" +"Otu02043" +"Otu02046" +"Otu02047" +"Otu02048" +"Otu02052" +"Otu02053" +"Otu02054" +"Otu02056" +"Otu02058" +"Otu02059" +"Otu02061" +"Otu02062" +"Otu02063" +"Otu02064" +"Otu02065" +"Otu02067" +"Otu02068" +"Otu02069" +"Otu02070" +"Otu02071" +"Otu02072" +"Otu02073" +"Otu02074" +"Otu02075" +"Otu02076" +"Otu02078" +"Otu02080" +"Otu02081" +"Otu02082" +"Otu02084" +"Otu02085" +"Otu02086" +"Otu02088" +"Otu02089" +"Otu02090" +"Otu02091" +"Otu02092" +"Otu02094" +"Otu02095" +"Otu02098" +"Otu02100" +"Otu02102" +"Otu02104" +"Otu02105" +"Otu02107" +"Otu02108" +"Otu02109" +"Otu02110" +"Otu02111" +"Otu02112" +"Otu02113" +"Otu02114" +"Otu02116" +"Otu02117" +"Otu02118" +"Otu02119" +"Otu02120" +"Otu02121" +"Otu02124" +"Otu02125" +"Otu02127" +"Otu02129" +"Otu02130" +"Otu02131" +"Otu02132" +"Otu02133" +"Otu02135" +"Otu02136" +"Otu02138" +"Otu02139" +"Otu02140" +"Otu02141" +"Otu02142" +"Otu02143" +"Otu02144" +"Otu02145" +"Otu02146" +"Otu02147" +"Otu02149" +"Otu02150" +"Otu02151" +"Otu02153" +"Otu02154" +"Otu02155" +"Otu02156" +"Otu02159" +"Otu02160" +"Otu02163" +"Otu02164" 
+"Otu02165" +"Otu02167" +"Otu02168" +"Otu02169" +"Otu02170" +"Otu02171" +"Otu02173" +"Otu02174" +"Otu02176" +"Otu02178" +"Otu02179" +"Otu02180" +"Otu02181" +"Otu02182" +"Otu02183" +"Otu02184" +"Otu02185" +"Otu02187" +"Otu02189" +"Otu02190" +"Otu02192" +"Otu02195" +"Otu02196" +"Otu02197" +"Otu02198" +"Otu02200" +"Otu02204" +"Otu02206" +"Otu02207" +"Otu02208" +"Otu02209" +"Otu02214" +"Otu02215" +"Otu02216" +"Otu02217" +"Otu02218" +"Otu02219" +"Otu02220" +"Otu02221" +"Otu02223" +"Otu02226" +"Otu02227" +"Otu02228" +"Otu02229" +"Otu02230" +"Otu02231" +"Otu02232" +"Otu02233" +"Otu02235" +"Otu02236" +"Otu02237" +"Otu02238" +"Otu02239" +"Otu02240" +"Otu02241" +"Otu02243" +"Otu02244" +"Otu02245" +"Otu02246" +"Otu02247" +"Otu02248" +"Otu02249" +"Otu02251" +"Otu02253" +"Otu02254" +"Otu02255" +"Otu02256" +"Otu02257" +"Otu02258" +"Otu02260" +"Otu02261" +"Otu02263" +"Otu02264" +"Otu02265" +"Otu02267" +"Otu02268" +"Otu02269" +"Otu02270" +"Otu02271" +"Otu02272" +"Otu02273" +"Otu02274" +"Otu02275" +"Otu02277" +"Otu02280" +"Otu02281" +"Otu02282" +"Otu02283" +"Otu02285" +"Otu02286" +"Otu02287" +"Otu02288" +"Otu02289" +"Otu02290" +"Otu02291" +"Otu02292" +"Otu02293" +"Otu02295" +"Otu02296" +"Otu02298" +"Otu02300" +"Otu02301" +"Otu02302" +"Otu02305" +"Otu02306" +"Otu02307" +"Otu02310" +"Otu02311" +"Otu02312" +"Otu02313" +"Otu02314" +"Otu02315" +"Otu02319" +"Otu02320" +"Otu02322" +"Otu02323" +"Otu02325" +"Otu02326" +"Otu02327" +"Otu02328" +"Otu02329" +"Otu02330" +"Otu02331" +"Otu02332" +"Otu02333" +"Otu02334" +"Otu02335" +"Otu02338" +"Otu02339" +"Otu02343" +"Otu02344" +"Otu02346" +"Otu02349" +"Otu02350" +"Otu02351" +"Otu02352" +"Otu02353" +"Otu02355" +"Otu02356" +"Otu02357" +"Otu02358" +"Otu02359" +"Otu02360" +"Otu02361" +"Otu02362" +"Otu02363" +"Otu02364" +"Otu02365" +"Otu02366" +"Otu02367" +"Otu02368" +"Otu02369" +"Otu02370" +"Otu02371" +"Otu02373" +"Otu02374" +"Otu02375" +"Otu02377" +"Otu02378" +"Otu02380" +"Otu02381" +"Otu02382" +"Otu02383" +"Otu02385" +"Otu02386" +"Otu02387" 
+"Otu02388" +"Otu02389" +"Otu02390" +"Otu02391" +"Otu02392" +"Otu02394" +"Otu02395" +"Otu02397" +"Otu02398" +"Otu02400" +"Otu02402" +"Otu02403" +"Otu02404" +"Otu02405" +"Otu02407" +"Otu02409" +"Otu02410" +"Otu02411" +"Otu02416" +"Otu02417" +"Otu02418" +"Otu02419" +"Otu02420" +"Otu02421" +"Otu02422" +"Otu02423" +"Otu02424" +"Otu02425" +"Otu02426" +"Otu02427" +"Otu02428" +"Otu02430" +"Otu02431" +"Otu02432" +"Otu02433" +"Otu02434" +"Otu02435" +"Otu02436" +"Otu02437" +"Otu02438" +"Otu02441" +"Otu02442" +"Otu02443" +"Otu02444" +"Otu02446" +"Otu02450" +"Otu02451" +"Otu02452" +"Otu02453" +"Otu02454" +"Otu02455" +"Otu02457" +"Otu02458" +"Otu02459" +"Otu02460" +"Otu02461" +"Otu02462" +"Otu02465" +"Otu02467" +"Otu02468" +"Otu02469" +"Otu02470" +"Otu02471" +"Otu02472" +"Otu02473" +"Otu02474" +"Otu02475" +"Otu02476" +"Otu02477" +"Otu02478" +"Otu02479" +"Otu02480" +"Otu02481" +"Otu02482" +"Otu02483" +"Otu02484" +"Otu02485" +"Otu02486" +"Otu02487" +"Otu02489" +"Otu02490" +"Otu02491" +"Otu02494" +"Otu02495" +"Otu02499" +"Otu02500" +"Otu02501" +"Otu02502" +"Otu02503" +"Otu02504" +"Otu02506" +"Otu02508" +"Otu02510" +"Otu02511" +"Otu02512" +"Otu02513" +"Otu02514" +"Otu02515" +"Otu02517" +"Otu02518" +"Otu02520" +"Otu02521" +"Otu02522" +"Otu02524" +"Otu02525" +"Otu02526" +"Otu02527" +"Otu02528" +"Otu02529" +"Otu02530" +"Otu02531" +"Otu02532" +"Otu02533" +"Otu02534" +"Otu02535" +"Otu02536" +"Otu02538" +"Otu02540" +"Otu02541" +"Otu02542" +"Otu02543" +"Otu02547" +"Otu02551" +"Otu02552" +"Otu02553" +"Otu02554" +"Otu02555" +"Otu02556" +"Otu02557" +"Otu02558" +"Otu02560" +"Otu02561" +"Otu02562" +"Otu02563" +"Otu02564" +"Otu02565" +"Otu02567" +"Otu02568" +"Otu02569" +"Otu02572" +"Otu02573" +"Otu02574" +"Otu02575" +"Otu02576" +"Otu02577" +"Otu02578" +"Otu02579" +"Otu02583" +"Otu02584" +"Otu02586" +"Otu02587" +"Otu02589" +"Otu02591" +"Otu02594" +"Otu02595" +"Otu02596" +"Otu02597" +"Otu02598" +"Otu02599" +"Otu02600" +"Otu02602" +"Otu02603" +"Otu02604" +"Otu02606" +"Otu02609" +"Otu02610" 
+"Otu02611" +"Otu02612" +"Otu02613" +"Otu02614" +"Otu02615" +"Otu02616" +"Otu02617" +"Otu02618" +"Otu02620" +"Otu02621" +"Otu02622" +"Otu02623" +"Otu02624" +"Otu02626" +"Otu02627" +"Otu02628" +"Otu02629" +"Otu02630" +"Otu02631" +"Otu02632" +"Otu02633" +"Otu02635" +"Otu02637" +"Otu02639" +"Otu02640" +"Otu02641" +"Otu02642" +"Otu02644" +"Otu02645" +"Otu02646" +"Otu02647" +"Otu02650" +"Otu02651" +"Otu02653" +"Otu02655" +"Otu02656" +"Otu02658" +"Otu02659" +"Otu02660" +"Otu02663" +"Otu02666" +"Otu02667" +"Otu02669" +"Otu02670" +"Otu02673" +"Otu02674" +"Otu02675" +"Otu02677" +"Otu02678" +"Otu02679" +"Otu02680" +"Otu02681" +"Otu02682" +"Otu02684" +"Otu02685" +"Otu02687" +"Otu02689" +"Otu02691" +"Otu02693" +"Otu02694" +"Otu02698" +"Otu02699" +"Otu02701" +"Otu02702" +"Otu02705" +"Otu02706" +"Otu02708" +"Otu02710" +"Otu02714" +"Otu02715" +"Otu02717" +"Otu02718" +"Otu02719" +"Otu02720" +"Otu02722" +"Otu02723" +"Otu02724" +"Otu02725" +"Otu02726" +"Otu02727" +"Otu02728" +"Otu02730" +"Otu02731" +"Otu02734" +"Otu02735" +"Otu02736" +"Otu02737" +"Otu02739" +"Otu02740" +"Otu02742" +"Otu02743" +"Otu02744" +"Otu02745" +"Otu02746" +"Otu02747" +"Otu02748" +"Otu02750" +"Otu02752" +"Otu02753" +"Otu02756" +"Otu02758" +"Otu02759" +"Otu02760" +"Otu02761" +"Otu02762" +"Otu02763" +"Otu02765" +"Otu02766" +"Otu02767" +"Otu02768" +"Otu02769" +"Otu02771" +"Otu02772" +"Otu02773" +"Otu02774" +"Otu02775" +"Otu02777" +"Otu02778" +"Otu02780" +"Otu02781" +"Otu02782" +"Otu02783" +"Otu02784" +"Otu02786" +"Otu02787" +"Otu02792" +"Otu02793" +"Otu02794" +"Otu02797" +"Otu02799" +"Otu02800" +"Otu02801" +"Otu02802" +"Otu02805" +"Otu02806" +"Otu02808" +"Otu02812" +"Otu02815" +"Otu02816" +"Otu02817" +"Otu02818" +"Otu02819" +"Otu02820" +"Otu02821" +"Otu02822" +"Otu02823" +"Otu02825" +"Otu02829" +"Otu02831" +"Otu02833" +"Otu02834" +"Otu02835" +"Otu02836" +"Otu02837" +"Otu02838" +"Otu02840" +"Otu02841" +"Otu02842" +"Otu02845" +"Otu02848" +"Otu02850" +"Otu02851" +"Otu02852" +"Otu02854" +"Otu02855" +"Otu02858" 
+"Otu02860" +"Otu02864" +"Otu02865" +"Otu02867" +"Otu02868" +"Otu02869" +"Otu02870" +"Otu02871" +"Otu02872" +"Otu02873" +"Otu02876" +"Otu02878" +"Otu02879" +"Otu02880" +"Otu02881" +"Otu02882" +"Otu02883" +"Otu02884" +"Otu02885" +"Otu02886" +"Otu02887" +"Otu02889" +"Otu02890" +"Otu02891" +"Otu02892" +"Otu02893" +"Otu02897" +"Otu02898" +"Otu02899" +"Otu02901" +"Otu02902" +"Otu02903" +"Otu02904" +"Otu02905" +"Otu02906" +"Otu02908" +"Otu02910" +"Otu02912" +"Otu02914" +"Otu02915" +"Otu02917" +"Otu02918" +"Otu02920" +"Otu02921" +"Otu02922" +"Otu02923" +"Otu02925" +"Otu02926" +"Otu02927" +"Otu02928" +"Otu02932" +"Otu02933" +"Otu02934" +"Otu02935" +"Otu02936" +"Otu02938" +"Otu02939" +"Otu02943" +"Otu02945" +"Otu02947" +"Otu02948" +"Otu02949" +"Otu02950" +"Otu02952" +"Otu02953" +"Otu02955" +"Otu02956" +"Otu02958" +"Otu02959" +"Otu02960" +"Otu02962" +"Otu02963" +"Otu02966" +"Otu02967" +"Otu02968" +"Otu02969" +"Otu02970" +"Otu02972" +"Otu02973" +"Otu02974" +"Otu02975" +"Otu02976" +"Otu02977" +"Otu02978" +"Otu02979" +"Otu02980" +"Otu02982" +"Otu02983" +"Otu02984" +"Otu02987" +"Otu02988" +"Otu02989" +"Otu02990" +"Otu02991" +"Otu02993" +"Otu02995" +"Otu02997" +"Otu02998" +"Otu02999" +"Otu03001" +"Otu03002" +"Otu03003" +"Otu03004" +"Otu03007" +"Otu03008" +"Otu03009" +"Otu03010" +"Otu03012" +"Otu03015" +"Otu03017" +"Otu03018" +"Otu03020" +"Otu03021" +"Otu03022" +"Otu03024" +"Otu03025" +"Otu03026" +"Otu03027" +"Otu03028" +"Otu03032" +"Otu03035" +"Otu03036" +"Otu03037" +"Otu03038" +"Otu03039" +"Otu03042" +"Otu03043" +"Otu03046" +"Otu03047" +"Otu03048" +"Otu03049" +"Otu03050" +"Otu03051" +"Otu03053" +"Otu03054" +"Otu03055" +"Otu03056" +"Otu03057" +"Otu03059" +"Otu03060" +"Otu03062" +"Otu03063" +"Otu03064" +"Otu03065" +"Otu03066" +"Otu03067" +"Otu03068" +"Otu03069" +"Otu03070" +"Otu03072" +"Otu03073" +"Otu03075" +"Otu03078" +"Otu03080" +"Otu03082" +"Otu03083" +"Otu03085" +"Otu03088" +"Otu03090" +"Otu03094" +"Otu03096" +"Otu03097" +"Otu03098" +"Otu03100" +"Otu03101" +"Otu03103" 
+"Otu03105" +"Otu03106" +"Otu03108" +"Otu03110" +"Otu03112" +"Otu03113" +"Otu03114" +"Otu03115" +"Otu03119" +"Otu03121" +"Otu03122" +"Otu03124" +"Otu03128" +"Otu03129" +"Otu03132" +"Otu03133" +"Otu03134" +"Otu03138" +"Otu03139" +"Otu03140" +"Otu03142" +"Otu03143" +"Otu03144" +"Otu03145" +"Otu03146" +"Otu03147" +"Otu03148" +"Otu03150" +"Otu03152" +"Otu03153" +"Otu03155" +"Otu03156" +"Otu03158" +"Otu03159" +"Otu03161" +"Otu03162" +"Otu03163" +"Otu03166" +"Otu03171" +"Otu03173" +"Otu03175" +"Otu03177" +"Otu03178" +"Otu03180" +"Otu03181" +"Otu03183" +"Otu03184" +"Otu03185" +"Otu03186" +"Otu03187" +"Otu03188" +"Otu03190" +"Otu03191" +"Otu03192" +"Otu03193" +"Otu03194" +"Otu03195" +"Otu03196" +"Otu03197" +"Otu03199" +"Otu03200" +"Otu03204" +"Otu03205" +"Otu03206" +"Otu03207" +"Otu03208" +"Otu03210" +"Otu03211" +"Otu03213" +"Otu03214" +"Otu03217" +"Otu03218" +"Otu03219" +"Otu03220" +"Otu03222" +"Otu03223" +"Otu03224" +"Otu03225" +"Otu03227" +"Otu03228" +"Otu03229" +"Otu03230" +"Otu03231" +"Otu03234" +"Otu03236" +"Otu03237" +"Otu03238" +"Otu03240" +"Otu03243" +"Otu03244" +"Otu03245" +"Otu03246" +"Otu03248" +"Otu03249" +"Otu03250" +"Otu03253" +"Otu03257" +"Otu03258" +"Otu03261" +"Otu03262" +"Otu03265" +"Otu03269" +"Otu03270" +"Otu03271" +"Otu03272" +"Otu03273" +"Otu03274" +"Otu03275" +"Otu03276" +"Otu03279" +"Otu03281" +"Otu03282" +"Otu03284" +"Otu03285" +"Otu03286" +"Otu03287" +"Otu03288" +"Otu03289" +"Otu03291" +"Otu03292" +"Otu03295" +"Otu03296" +"Otu03297" +"Otu03298" +"Otu03299" +"Otu03300" +"Otu03301" +"Otu03304" +"Otu03305" +"Otu03306" +"Otu03308" +"Otu03309" +"Otu03312" +"Otu03313" +"Otu03314" +"Otu03317" +"Otu03318" +"Otu03319" +"Otu03320" +"Otu03321" +"Otu03322" +"Otu03323" +"Otu03324" +"Otu03325" +"Otu03326" +"Otu03327" +"Otu03329" +"Otu03330" +"Otu03332" +"Otu03334" +"Otu03335" +"Otu03336" +"Otu03337" +"Otu03340" +"Otu03341" +"Otu03342" +"Otu03343" +"Otu03344" +"Otu03345" +"Otu03346" +"Otu03347" +"Otu03348" +"Otu03349" +"Otu03350" +"Otu03351" +"Otu03352" 
+"Otu03354" +"Otu03355" +"Otu03356" +"Otu03358" +"Otu03359" +"Otu03360" +"Otu03362" +"Otu03369" +"Otu03373" +"Otu03377" +"Otu03378" +"Otu03380" +"Otu03382" +"Otu03384" +"Otu03385" +"Otu03386" +"Otu03388" +"Otu03389" +"Otu03390" +"Otu03391" +"Otu03393" +"Otu03395" +"Otu03397" +"Otu03398" +"Otu03399" +"Otu03400" +"Otu03402" +"Otu03406" +"Otu03407" +"Otu03408" +"Otu03411" +"Otu03414" +"Otu03419" +"Otu03420" +"Otu03421" +"Otu03422" +"Otu03424" +"Otu03425" +"Otu03426" +"Otu03429" +"Otu03430" +"Otu03431" +"Otu03432" +"Otu03433" +"Otu03434" +"Otu03435" +"Otu03436" +"Otu03439" +"Otu03441" +"Otu03442" +"Otu03444" +"Otu03445" +"Otu03446" +"Otu03447" +"Otu03448" +"Otu03449" +"Otu03452" +"Otu03453" +"Otu03456" +"Otu03459" +"Otu03461" +"Otu03462" +"Otu03463" +"Otu03464" +"Otu03465" +"Otu03466" +"Otu03467" +"Otu03470" +"Otu03471" +"Otu03474" +"Otu03475" +"Otu03476" +"Otu03478" +"Otu03479" +"Otu03480" +"Otu03481" +"Otu03482" +"Otu03483" +"Otu03484" +"Otu03485" +"Otu03487" +"Otu03488" +"Otu03489" +"Otu03490" +"Otu03491" +"Otu03492" +"Otu03495" +"Otu03496" +"Otu03497" +"Otu03499" +"Otu03500" +"Otu03501" +"Otu03502" +"Otu03503" +"Otu03504" +"Otu03505" +"Otu03506" +"Otu03508" +"Otu03510" +"Otu03511" +"Otu03512" +"Otu03513" +"Otu03516" +"Otu03517" +"Otu03519" +"Otu03520" +"Otu03522" +"Otu03523" +"Otu03526" +"Otu03527" +"Otu03528" +"Otu03530" +"Otu03531" +"Otu03532" +"Otu03533" +"Otu03534" +"Otu03535" +"Otu03536" +"Otu03537" +"Otu03538" +"Otu03541" +"Otu03542" +"Otu03543" +"Otu03544" +"Otu03546" +"Otu03547" +"Otu03548" +"Otu03551" +"Otu03552" +"Otu03555" +"Otu03556" +"Otu03557" +"Otu03558" +"Otu03559" +"Otu03560" +"Otu03561" +"Otu03563" +"Otu03564" +"Otu03566" +"Otu03567" +"Otu03568" +"Otu03570" +"Otu03571" +"Otu03572" +"Otu03574" +"Otu03575" +"Otu03578" +"Otu03579" +"Otu03580" +"Otu03582" +"Otu03583" +"Otu03584" +"Otu03587" +"Otu03590" +"Otu03591" +"Otu03592" +"Otu03593" +"Otu03595" +"Otu03599" +"Otu03600" +"Otu03602" +"Otu03603" +"Otu03604" +"Otu03605" +"Otu03607" +"Otu03609" 
+"Otu03617" +"Otu03618" +"Otu03619" +"Otu03621" +"Otu03624" +"Otu03625" +"Otu03626" +"Otu03628" +"Otu03630" +"Otu03633" +"Otu03635" +"Otu03636" +"Otu03640" +"Otu03641" +"Otu03642" +"Otu03643" +"Otu03644" +"Otu03645" +"Otu03646" +"Otu03647" +"Otu03650" +"Otu03651" +"Otu03652" +"Otu03653" +"Otu03655" +"Otu03657" +"Otu03660" +"Otu03661" +"Otu03662" +"Otu03663" +"Otu03664" +"Otu03668" +"Otu03670" +"Otu03671" +"Otu03672" +"Otu03674" +"Otu03675" +"Otu03677" +"Otu03678" +"Otu03679" +"Otu03680" +"Otu03682" +"Otu03683" +"Otu03684" +"Otu03685" +"Otu03687" +"Otu03688" +"Otu03689" +"Otu03690" +"Otu03691" +"Otu03692" +"Otu03694" +"Otu03695" +"Otu03696" +"Otu03697" +"Otu03698" +"Otu03699" +"Otu03700" +"Otu03702" +"Otu03703" +"Otu03704" +"Otu03705" +"Otu03708" +"Otu03709" +"Otu03710" +"Otu03711" +"Otu03712" +"Otu03715" +"Otu03716" +"Otu03717" +"Otu03718" +"Otu03722" +"Otu03723" +"Otu03725" +"Otu03728" +"Otu03729" +"Otu03730" +"Otu03731" +"Otu03734" +"Otu03736" +"Otu03737" +"Otu03738" +"Otu03739" +"Otu03741" +"Otu03743" +"Otu03744" +"Otu03749" +"Otu03750" +"Otu03756" +"Otu03758" +"Otu03759" +"Otu03761" +"Otu03762" +"Otu03763" +"Otu03765" +"Otu03766" +"Otu03770" +"Otu03772" +"Otu03773" +"Otu03774" +"Otu03776" +"Otu03777" +"Otu03778" +"Otu03779" +"Otu03781" +"Otu03783" +"Otu03785" +"Otu03786" +"Otu03787" +"Otu03788" +"Otu03789" +"Otu03790" +"Otu03791" +"Otu03792" +"Otu03794" +"Otu03795" +"Otu03798" +"Otu03799" +"Otu03800" +"Otu03801" +"Otu03806" +"Otu03808" +"Otu03809" +"Otu03810" +"Otu03811" +"Otu03812" +"Otu03813" +"Otu03814" +"Otu03815" +"Otu03816" +"Otu03817" +"Otu03818" +"Otu03819" +"Otu03820" +"Otu03821" +"Otu03822" +"Otu03823" +"Otu03824" +"Otu03826" +"Otu03827" +"Otu03831" +"Otu03834" +"Otu03836" +"Otu03838" +"Otu03843" +"Otu03845" +"Otu03847" +"Otu03850" +"Otu03851" +"Otu03852" +"Otu03858" +"Otu03860" +"Otu03861" +"Otu03862" +"Otu03864" +"Otu03865" +"Otu03868" +"Otu03869" +"Otu03871" +"Otu03872" +"Otu03877" +"Otu03879" +"Otu03883" +"Otu03884" +"Otu03885" +"Otu03887" 
+"Otu03890" +"Otu03891" +"Otu03894" +"Otu03895" +"Otu03896" +"Otu03899" +"Otu03905" +"Otu03906" +"Otu03907" +"Otu03909" +"Otu03911" +"Otu03912" +"Otu03913" +"Otu03914" +"Otu03915" +"Otu03916" +"Otu03918" +"Otu03920" +"Otu03923" +"Otu03924" +"Otu03925" +"Otu03928" +"Otu03932" +"Otu03933" +"Otu03934" +"Otu03936" +"Otu03938" +"Otu03939" +"Otu03940" +"Otu03942" +"Otu03945" +"Otu03947" +"Otu03949" +"Otu03953" +"Otu03954" +"Otu03955" +"Otu03956" +"Otu03957" +"Otu03961" +"Otu03962" +"Otu03963" +"Otu03966" +"Otu03967" +"Otu03968" +"Otu03971" +"Otu03972" +"Otu03974" +"Otu03976" +"Otu03977" +"Otu03979" +"Otu03985" +"Otu03986" +"Otu03987" +"Otu03988" +"Otu03994" +"Otu03996" +"Otu03997" +"Otu04000" +"Otu04001" +"Otu04002" +"Otu04004" +"Otu04008" +"Otu04009" +"Otu04012" +"Otu04015" +"Otu04016" +"Otu04019" +"Otu04020" +"Otu04021" +"Otu04022" +"Otu04023" +"Otu04024" +"Otu04025" +"Otu04027" +"Otu04028" +"Otu04030" +"Otu04032" +"Otu04036" +"Otu04038" +"Otu04039" +"Otu04040" +"Otu04041" +"Otu04042" +"Otu04043" +"Otu04045" +"Otu04046" +"Otu04047" +"Otu04050" +"Otu04051" +"Otu04052" +"Otu04053" +"Otu04054" +"Otu04055" +"Otu04058" +"Otu04060" +"Otu04061" +"Otu04062" +"Otu04066" +"Otu04067" +"Otu04068" +"Otu04069" +"Otu04070" +"Otu04071" +"Otu04072" +"Otu04074" +"Otu04075" +"Otu04077" +"Otu04078" +"Otu04080" +"Otu04081" +"Otu04082" +"Otu04083" +"Otu04086" +"Otu04088" +"Otu04091" +"Otu04092" +"Otu04096" +"Otu04097" +"Otu04098" +"Otu04100" +"Otu04102" +"Otu04103" +"Otu04104" +"Otu04106" +"Otu04107" +"Otu04108" +"Otu04109" +"Otu04111" +"Otu04112" +"Otu04113" +"Otu04114" +"Otu04115" +"Otu04116" +"Otu04117" +"Otu04118" +"Otu04119" +"Otu04120" +"Otu04121" +"Otu04124" +"Otu04125" +"Otu04128" +"Otu04131" +"Otu04143" +"Otu04144" +"Otu04145" +"Otu04146" +"Otu04147" +"Otu04148" +"Otu04151" +"Otu04153" +"Otu04155" +"Otu04157" +"Otu04162" +"Otu04163" +"Otu04164" +"Otu04166" +"Otu04168" +"Otu04173" +"Otu04178" +"Otu04183" +"Otu04184" +"Otu04187" +"Otu04192" +"Otu04193" +"Otu04195" +"Otu04202" 
+"Otu04214" +"Otu04215" +"Otu04219" +"Otu04225" +"Otu04226" +"Otu04227" +"Otu04228" +"Otu04230" +"Otu04231" +"Otu04232" +"Otu04233" +"Otu04234" +"Otu04235" +"Otu04236" +"Otu04237" +"Otu04239" +"Otu04245" +"Otu04247" +"Otu04253" +"Otu04256" +"Otu04257" +"Otu04260" +"Otu04262" +"Otu04265" +"Otu04267" +"Otu04268" +"Otu04272" +"Otu04275" +"Otu04276" +"Otu04282" +"Otu04284" +"Otu04286" +"Otu04291" +"Otu04292" +"Otu04294" +"Otu04295" +"Otu04298" +"Otu04299" +"Otu04300" +"Otu04301" +"Otu04302" +"Otu04303" +"Otu04304" +"Otu04307" +"Otu04308" +"Otu04309" +"Otu04310" +"Otu04315" +"Otu04317" +"Otu04320" +"Otu04322" +"Otu04323" +"Otu04324" +"Otu04328" +"Otu04338" +"Otu04343" +"Otu04345" +"Otu04346" +"Otu04348" +"Otu04349" +"Otu04350" +"Otu04351" +"Otu04353" +"Otu04355" +"Otu04356" +"Otu04357" +"Otu04358" +"Otu04362" +"Otu04364" +"Otu04366" +"Otu04368" +"Otu04369" +"Otu04370" +"Otu04371" +"Otu04372" +"Otu04373" +"Otu04374" +"Otu04375" +"Otu04377" +"Otu04378" +"Otu04381" +"Otu04383" +"Otu04385" +"Otu04389" +"Otu04390" +"Otu04391" +"Otu04392" +"Otu04394" +"Otu04395" +"Otu04396" +"Otu04397" +"Otu04399" +"Otu04402" +"Otu04403" +"Otu04404" +"Otu04405" +"Otu04406" +"Otu04407" +"Otu04408" +"Otu04409" +"Otu04415" +"Otu04416" +"Otu04417" +"Otu04419" +"Otu04423" +"Otu04424" +"Otu04426" +"Otu04427" +"Otu04429" +"Otu04430" +"Otu04432" +"Otu04433" +"Otu04434" +"Otu04435" +"Otu04436" +"Otu04437" +"Otu04439" +"Otu04440" +"Otu04443" +"Otu04444" +"Otu04446" +"Otu04449" +"Otu04453" +"Otu04454" +"Otu04456" +"Otu04459" +"Otu04462" +"Otu04463" +"Otu04464" +"Otu04465" +"Otu04467" +"Otu04468" +"Otu04470" +"Otu04471" +"Otu04472" +"Otu04473" +"Otu04475" +"Otu04478" +"Otu04480" +"Otu04483" +"Otu04488" +"Otu04491" +"Otu04492" +"Otu04494" +"Otu04496" +"Otu04497" +"Otu04498" +"Otu04500" +"Otu04501" +"Otu04503" +"Otu04506" +"Otu04507" +"Otu04508" +"Otu04509" +"Otu04511" +"Otu04513" +"Otu04516" +"Otu04518" +"Otu04519" +"Otu04522" +"Otu04523" +"Otu04524" +"Otu04525" +"Otu04526" +"Otu04530" +"Otu04531" 
+"Otu04532" +"Otu04533" +"Otu04534" +"Otu04536" +"Otu04537" +"Otu04538" +"Otu04539" +"Otu04540" +"Otu04545" +"Otu04547" +"Otu04552" +"Otu04553" +"Otu04554" +"Otu04555" +"Otu04557" +"Otu04558" +"Otu04559" +"Otu04560" +"Otu04561" +"Otu04562" +"Otu04563" +"Otu04564" +"Otu04565" +"Otu04566" +"Otu04567" +"Otu04568" +"Otu04569" +"Otu04570" +"Otu04571" +"Otu04572" +"Otu04573" +"Otu04575" +"Otu04576" +"Otu04577" +"Otu04578" +"Otu04579" +"Otu04580" +"Otu04581" +"Otu04582" +"Otu04583" +"Otu04584" +"Otu04587" +"Otu04589" +"Otu04590" +"Otu04591" +"Otu04594" +"Otu04595" +"Otu04596" +"Otu04597" +"Otu04598" +"Otu04599" +"Otu04605" +"Otu04606" +"Otu04608" +"Otu04609" +"Otu04610" +"Otu04611" +"Otu04612" +"Otu04614" +"Otu04615" +"Otu04617" +"Otu04619" +"Otu04620" +"Otu04622" +"Otu04625" +"Otu04629" +"Otu04630" +"Otu04631" +"Otu04636" +"Otu04638" +"Otu04640" +"Otu04641" +"Otu04645" +"Otu04650" +"Otu04653" +"Otu04657" +"Otu04658" +"Otu04659" +"Otu04660" +"Otu04662" +"Otu04664" +"Otu04666" +"Otu04667" +"Otu04668" +"Otu04669" +"Otu04671" +"Otu04674" +"Otu04675" +"Otu04685" +"Otu04686" +"Otu04688" +"Otu04689" +"Otu04690" +"Otu04691" +"Otu04693" +"Otu04694" +"Otu04696" +"Otu04697" +"Otu04698" +"Otu04699" +"Otu04700" +"Otu04701" +"Otu04702" +"Otu04703" +"Otu04705" +"Otu04706" +"Otu04709" +"Otu04712" +"Otu04713" +"Otu04715" +"Otu04717" +"Otu04719" +"Otu04722" +"Otu04724" +"Otu04725" +"Otu04726" +"Otu04727" +"Otu04728" +"Otu04729" +"Otu04731" +"Otu04732" +"Otu04733" +"Otu04735" +"Otu04736" +"Otu04737" +"Otu04738" +"Otu04739" +"Otu04741" +"Otu04742" +"Otu04743" +"Otu04745" +"Otu04747" +"Otu04748" +"Otu04750" +"Otu04751" +"Otu04752" +"Otu04753" +"Otu04754" +"Otu04756" +"Otu04759" +"Otu04760" +"Otu04761" +"Otu04763" +"Otu04764" +"Otu04765" +"Otu04767" +"Otu04769" +"Otu04774" +"Otu04775" +"Otu04776" +"Otu04777" +"Otu04779" +"Otu04780" +"Otu04781" +"Otu04784" +"Otu04786" +"Otu04787" +"Otu04788" +"Otu04792" +"Otu04793" +"Otu04794" +"Otu04796" +"Otu04799" +"Otu04801" +"Otu04803" +"Otu04804" 
+"Otu04806" +"Otu04807" +"Otu04809" +"Otu04810" +"Otu04811" +"Otu04812" +"Otu04839" +"Otu04870" +"Otu04876" +"Otu04884" +"Otu04916" +"Otu04928" +"Otu04947" +"Otu04949" +"Otu04997" +"Otu05007" +"Otu05026" +"Otu05043" +"Otu05078" +"Otu05116" +"Otu05121" +"Otu05142" +"Otu05145" +"Otu05158" +"Otu05159" +"Otu05186" +"Otu05222" +"Otu05271" +"Otu05275" +"Otu05276" +"Otu05294" +"Otu05296" +"Otu05297" +"Otu05302" +"Otu05309" +"Otu05327" +"Otu05329" +"Otu05330" +"Otu05331" +"Otu05332" +"Otu05333" +"Otu05334" +"Otu05335" +"Otu05336" +"Otu05337" +"Otu05338" +"Otu05339" +"Otu05340" +"Otu05341" +"Otu05342" +"Otu05343" +"Otu05344" +"Otu05345" +"Otu05348" +"Otu05349" +"Otu05350" +"Otu05354" +"Otu05355" +"Otu05369" +"Otu05371" +"Otu05372" +"Otu05374" +"Otu05383" +"Otu05390" +"Otu05395" +"Otu05408" +"Otu05410" +"Otu05411" +"Otu05413" +"Otu05420" +"Otu05428" +"Otu05434" +"Otu05438" +"Otu05441" +"Otu05447" +"Otu05449" +"Otu05451" +"Otu05454" +"Otu05463" +"Otu05464" +"Otu05477" +"Otu05498" +"Otu05499" +"Otu05507" +"Otu05515" +"Otu05519" +"Otu05520" +"Otu05546" +"Otu05558" +"Otu05571" +"Otu05639" +"Otu05640" +"Otu05642" +"Otu05651" +"Otu05657" +"Otu05661" +"Otu05663" +"Otu05665" +"Otu05689" +"Otu05697" +"Otu05744" +"Otu05746" +"Otu05749" +"Otu05759" +"Otu05763" +"Otu05767" +"Otu05775" +"Otu05781" +"Otu05791" +"Otu05800" +"Otu05806" +"Otu05818" +"Otu05822" +"Otu05834" +"Otu05835" +"Otu05836" +"Otu05841" +"Otu05854" +"Otu05855" +"Otu05856" +"Otu05857" +"Otu05895" +"Otu05896" +"Otu05897" +"Otu05898" +"Otu05899" +"Otu05900" +"Otu05901" +"Otu05943" +"Otu05950" +"Otu05957" +"Otu05958" +"Otu05961" +"Otu05989" +"Otu06015" +"Otu06030" +"Otu06031" +"Otu06034" +"Otu06051" +"Otu06052" +"Otu06053" +"Otu06059" +"Otu06064" +"Otu06068" +"Otu06073" +"Otu06083" +"Otu06084" +"Otu06086" +"Otu06087" +"Otu06088" +"Otu06101" +"Otu06107" +"Otu06108" +"Otu06109" +"Otu06111" +"Otu06112" +"Otu06119" +"Otu06123" +"Otu06134" +"Otu06135" +"Otu06137" +"Otu06141" +"Otu06142" +"Otu06147" +"Otu06150" +"Otu06157" 
+"Otu06160" +"Otu06161" +"Otu06167" +"Otu06168" +"Otu06169" +"Otu06171" +"Otu06173" +"Otu06174" +"Otu06176" +"Otu06177" +"Otu06178" +"Otu06179" +"Otu06181" +"Otu06183" +"Otu06184" +"Otu06185" +"Otu06192" +"Otu06205" +"Otu06206" +"Otu06207" +"Otu06208" +"Otu06209" +"Otu06210" +"Otu06212" +"Otu06214" +"Otu06215" +"Otu06216" +"Otu06217" +"Otu06218" +"Otu06221" +"Otu06246" +"Otu06250" +"Otu06258" +"Otu06268" +"Otu06270" +"Otu06273" +"Otu06275" +"Otu06292" +"Otu06294" +"Otu06296" +"Otu06300" +"Otu06303" +"Otu06304" +"Otu06309" +"Otu06312" +"Otu06313" +"Otu06324" +"Otu06325" diff --git a/Microbiota_analysis_R/OTU.2w.csv b/Microbiota_analysis_R/OTU.2w.csv new file mode 100644 index 0000000..e596335 --- /dev/null +++ b/Microbiota_analysis_R/OTU.2w.csv @@ -0,0 +1,834 @@ +"Otu00001" +"Otu00002" +"Otu00003" +"Otu00004" +"Otu00005" +"Otu00006" +"Otu00007" +"Otu00008" +"Otu00009" +"Otu00010" +"Otu00011" +"Otu00012" +"Otu00013" +"Otu00014" +"Otu00017" +"Otu00019" +"Otu00020" +"Otu00021" +"Otu00022" +"Otu00023" +"Otu00024" +"Otu00026" +"Otu00027" +"Otu00028" +"Otu00030" +"Otu00031" +"Otu00032" +"Otu00033" +"Otu00034" +"Otu00035" +"Otu00037" +"Otu00039" +"Otu00041" +"Otu00044" +"Otu00045" +"Otu00047" +"Otu00048" +"Otu00050" +"Otu00053" +"Otu00056" +"Otu00057" +"Otu00059" +"Otu00060" +"Otu00062" +"Otu00063" +"Otu00064" +"Otu00065" +"Otu00066" +"Otu00068" +"Otu00071" +"Otu00072" +"Otu00078" +"Otu00080" +"Otu00081" +"Otu00082" +"Otu00085" +"Otu00089" +"Otu00090" +"Otu00096" +"Otu00100" +"Otu00101" +"Otu00105" +"Otu00107" +"Otu00108" +"Otu00109" +"Otu00111" +"Otu00112" +"Otu00117" +"Otu00118" +"Otu00120" +"Otu00123" +"Otu00124" +"Otu00126" +"Otu00128" +"Otu00130" +"Otu00137" +"Otu00139" +"Otu00140" +"Otu00141" +"Otu00143" +"Otu00145" +"Otu00148" +"Otu00149" +"Otu00150" +"Otu00152" +"Otu00155" +"Otu00160" +"Otu00161" +"Otu00162" +"Otu00166" +"Otu00172" +"Otu00174" +"Otu00175" +"Otu00176" +"Otu00177" +"Otu00179" +"Otu00183" +"Otu00184" +"Otu00185" +"Otu00187" +"Otu00188" +"Otu00189" 
+"Otu00190" +"Otu00191" +"Otu00195" +"Otu00197" +"Otu00199" +"Otu00200" +"Otu00206" +"Otu00207" +"Otu00208" +"Otu00214" +"Otu00215" +"Otu00216" +"Otu00217" +"Otu00219" +"Otu00220" +"Otu00226" +"Otu00227" +"Otu00233" +"Otu00238" +"Otu00241" +"Otu00246" +"Otu00250" +"Otu00252" +"Otu00256" +"Otu00259" +"Otu00262" +"Otu00263" +"Otu00268" +"Otu00274" +"Otu00279" +"Otu00280" +"Otu00285" +"Otu00291" +"Otu00293" +"Otu00306" +"Otu00308" +"Otu00310" +"Otu00311" +"Otu00313" +"Otu00315" +"Otu00316" +"Otu00319" +"Otu00320" +"Otu00330" +"Otu00331" +"Otu00334" +"Otu00335" +"Otu00346" +"Otu00347" +"Otu00352" +"Otu00355" +"Otu00369" +"Otu00370" +"Otu00371" +"Otu00372" +"Otu00373" +"Otu00381" +"Otu00391" +"Otu00392" +"Otu00394" +"Otu00399" +"Otu00404" +"Otu00405" +"Otu00412" +"Otu00413" +"Otu00415" +"Otu00427" +"Otu00431" +"Otu00445" +"Otu00451" +"Otu00455" +"Otu00459" +"Otu00460" +"Otu00461" +"Otu00464" +"Otu00465" +"Otu00475" +"Otu00486" +"Otu00489" +"Otu00496" +"Otu00497" +"Otu00520" +"Otu00528" +"Otu00529" +"Otu00536" +"Otu00543" +"Otu00560" +"Otu00562" +"Otu00583" +"Otu00589" +"Otu00593" +"Otu00596" +"Otu00599" +"Otu00606" +"Otu00608" +"Otu00610" +"Otu00618" +"Otu00620" +"Otu00621" +"Otu00628" +"Otu00637" +"Otu00643" +"Otu00645" +"Otu00646" +"Otu00651" +"Otu00653" +"Otu00671" +"Otu00682" +"Otu00683" +"Otu00700" +"Otu00701" +"Otu00702" +"Otu00712" +"Otu00722" +"Otu00723" +"Otu00731" +"Otu00733" +"Otu00753" +"Otu00758" +"Otu00761" +"Otu00766" +"Otu00773" +"Otu00775" +"Otu00777" +"Otu00788" +"Otu00793" +"Otu00795" +"Otu00803" +"Otu00807" +"Otu00811" +"Otu00816" +"Otu00826" +"Otu00831" +"Otu00838" +"Otu00848" +"Otu00849" +"Otu00853" +"Otu00877" +"Otu00884" +"Otu00889" +"Otu00890" +"Otu00897" +"Otu00898" +"Otu00899" +"Otu00901" +"Otu00903" +"Otu00918" +"Otu00932" +"Otu00933" +"Otu00935" +"Otu00936" +"Otu00938" +"Otu00940" +"Otu00949" +"Otu00955" +"Otu00960" +"Otu00972" +"Otu00977" +"Otu01008" +"Otu01015" +"Otu01021" +"Otu01033" +"Otu01036" +"Otu01040" +"Otu01046" +"Otu01050" 
+"Otu01055" +"Otu01080" +"Otu01082" +"Otu01093" +"Otu01109" +"Otu01118" +"Otu01119" +"Otu01124" +"Otu01143" +"Otu01151" +"Otu01152" +"Otu01161" +"Otu01164" +"Otu01175" +"Otu01188" +"Otu01199" +"Otu01240" +"Otu01242" +"Otu01257" +"Otu01261" +"Otu01286" +"Otu01297" +"Otu01315" +"Otu01331" +"Otu01352" +"Otu01359" +"Otu01364" +"Otu01369" +"Otu01389" +"Otu01393" +"Otu01394" +"Otu01400" +"Otu01411" +"Otu01434" +"Otu01435" +"Otu01454" +"Otu01471" +"Otu01481" +"Otu01482" +"Otu01506" +"Otu01507" +"Otu01512" +"Otu01516" +"Otu01537" +"Otu01544" +"Otu01548" +"Otu01551" +"Otu01564" +"Otu01568" +"Otu01573" +"Otu01574" +"Otu01575" +"Otu01580" +"Otu01589" +"Otu01591" +"Otu01604" +"Otu01607" +"Otu01628" +"Otu01629" +"Otu01630" +"Otu01631" +"Otu01635" +"Otu01638" +"Otu01648" +"Otu01658" +"Otu01661" +"Otu01668" +"Otu01671" +"Otu01689" +"Otu01697" +"Otu01719" +"Otu01721" +"Otu01722" +"Otu01726" +"Otu01727" +"Otu01729" +"Otu01737" +"Otu01741" +"Otu01748" +"Otu01749" +"Otu01750" +"Otu01765" +"Otu01767" +"Otu01770" +"Otu01779" +"Otu01786" +"Otu01793" +"Otu01795" +"Otu01796" +"Otu01797" +"Otu01800" +"Otu01842" +"Otu01859" +"Otu01860" +"Otu01867" +"Otu01877" +"Otu01878" +"Otu01886" +"Otu01888" +"Otu01889" +"Otu01902" +"Otu01904" +"Otu01908" +"Otu01928" +"Otu01937" +"Otu01949" +"Otu01958" +"Otu01960" +"Otu01962" +"Otu01963" +"Otu01992" +"Otu01995" +"Otu01999" +"Otu02004" +"Otu02011" +"Otu02024" +"Otu02028" +"Otu02029" +"Otu02032" +"Otu02033" +"Otu02036" +"Otu02044" +"Otu02058" +"Otu02060" +"Otu02066" +"Otu02077" +"Otu02080" +"Otu02083" +"Otu02093" +"Otu02097" +"Otu02099" +"Otu02101" +"Otu02106" +"Otu02115" +"Otu02126" +"Otu02128" +"Otu02134" +"Otu02137" +"Otu02148" +"Otu02155" +"Otu02158" +"Otu02161" +"Otu02162" +"Otu02166" +"Otu02172" +"Otu02175" +"Otu02188" +"Otu02193" +"Otu02194" +"Otu02198" +"Otu02201" +"Otu02202" +"Otu02222" +"Otu02224" +"Otu02225" +"Otu02227" +"Otu02234" +"Otu02269" +"Otu02276" +"Otu02279" +"Otu02299" +"Otu02317" +"Otu02318" +"Otu02321" +"Otu02324" +"Otu02335" 
+"Otu02336" +"Otu02340" +"Otu02342" +"Otu02347" +"Otu02354" +"Otu02362" +"Otu02367" +"Otu02376" +"Otu02384" +"Otu02399" +"Otu02406" +"Otu02408" +"Otu02422" +"Otu02429" +"Otu02439" +"Otu02440" +"Otu02445" +"Otu02448" +"Otu02464" +"Otu02488" +"Otu02493" +"Otu02509" +"Otu02519" +"Otu02528" +"Otu02531" +"Otu02539" +"Otu02540" +"Otu02545" +"Otu02566" +"Otu02570" +"Otu02582" +"Otu02588" +"Otu02593" +"Otu02608" +"Otu02609" +"Otu02619" +"Otu02625" +"Otu02648" +"Otu02649" +"Otu02654" +"Otu02664" +"Otu02671" +"Otu02672" +"Otu02686" +"Otu02688" +"Otu02690" +"Otu02692" +"Otu02703" +"Otu02711" +"Otu02729" +"Otu02732" +"Otu02733" +"Otu02738" +"Otu02741" +"Otu02749" +"Otu02754" +"Otu02770" +"Otu02779" +"Otu02789" +"Otu02790" +"Otu02807" +"Otu02809" +"Otu02811" +"Otu02813" +"Otu02823" +"Otu02826" +"Otu02827" +"Otu02828" +"Otu02830" +"Otu02832" +"Otu02843" +"Otu02844" +"Otu02856" +"Otu02859" +"Otu02883" +"Otu02907" +"Otu02911" +"Otu02915" +"Otu02916" +"Otu02931" +"Otu02937" +"Otu02941" +"Otu02944" +"Otu02946" +"Otu02954" +"Otu02957" +"Otu02961" +"Otu02964" +"Otu02971" +"Otu02981" +"Otu02985" +"Otu02986" +"Otu03005" +"Otu03011" +"Otu03013" +"Otu03014" +"Otu03030" +"Otu03031" +"Otu03033" +"Otu03034" +"Otu03040" +"Otu03041" +"Otu03058" +"Otu03067" +"Otu03074" +"Otu03081" +"Otu03086" +"Otu03089" +"Otu03093" +"Otu03099" +"Otu03104" +"Otu03117" +"Otu03118" +"Otu03119" +"Otu03125" +"Otu03136" +"Otu03149" +"Otu03179" +"Otu03182" +"Otu03189" +"Otu03196" +"Otu03202" +"Otu03203" +"Otu03226" +"Otu03235" +"Otu03239" +"Otu03241" +"Otu03252" +"Otu03255" +"Otu03260" +"Otu03264" +"Otu03266" +"Otu03277" +"Otu03290" +"Otu03293" +"Otu03303" +"Otu03311" +"Otu03338" +"Otu03339" +"Otu03361" +"Otu03366" +"Otu03368" +"Otu03372" +"Otu03374" +"Otu03379" +"Otu03383" +"Otu03396" +"Otu03403" +"Otu03415" +"Otu03437" +"Otu03440" +"Otu03443" +"Otu03453" +"Otu03493" +"Otu03498" +"Otu03515" +"Otu03521" +"Otu03545" +"Otu03554" +"Otu03573" +"Otu03576" +"Otu03581" +"Otu03585" +"Otu03589" +"Otu03596" +"Otu03608" 
+"Otu03610" +"Otu03615" +"Otu03616" +"Otu03627" +"Otu03629" +"Otu03632" +"Otu03634" +"Otu03637" +"Otu03638" +"Otu03639" +"Otu03658" +"Otu03681" +"Otu03701" +"Otu03719" +"Otu03720" +"Otu03732" +"Otu03733" +"Otu03746" +"Otu03748" +"Otu03753" +"Otu03755" +"Otu03757" +"Otu03760" +"Otu03767" +"Otu03768" +"Otu03771" +"Otu03804" +"Otu03805" +"Otu03828" +"Otu03867" +"Otu03888" +"Otu03902" +"Otu03917" +"Otu03980" +"Otu03989" +"Otu03993" +"Otu04011" +"Otu04013" +"Otu04029" +"Otu04064" +"Otu04076" +"Otu04087" +"Otu04127" +"Otu04130" +"Otu04133" +"Otu04137" +"Otu04167" +"Otu04169" +"Otu04177" +"Otu04181" +"Otu04185" +"Otu04191" +"Otu04198" +"Otu04241" +"Otu04271" +"Otu04273" +"Otu04278" +"Otu04280" +"Otu04285" +"Otu04288" +"Otu04296" +"Otu04316" +"Otu04319" +"Otu04326" +"Otu04330" +"Otu04337" +"Otu04341" +"Otu04342" +"Otu04354" +"Otu04363" +"Otu04376" +"Otu04379" +"Otu04380" +"Otu04382" +"Otu04401" +"Otu04418" +"Otu04425" +"Otu04431" +"Otu04445" +"Otu04450" +"Otu04455" +"Otu04479" +"Otu04481" +"Otu04482" +"Otu04484" +"Otu04489" +"Otu04495" +"Otu04543" +"Otu04548" +"Otu04549" +"Otu04600" +"Otu04601" +"Otu04617" +"Otu04623" +"Otu04624" +"Otu04627" +"Otu04637" +"Otu04639" +"Otu04642" +"Otu04654" +"Otu04665" +"Otu04679" +"Otu04692" +"Otu04723" +"Otu04746" +"Otu04758" +"Otu04773" +"Otu04785" +"Otu04791" +"Otu04795" +"Otu04797" +"Otu04805" +"Otu04901" +"Otu04937" +"Otu04990" +"Otu05016" +"Otu05029" +"Otu05083" +"Otu05085" +"Otu05086" +"Otu05088" +"Otu05103" +"Otu05109" +"Otu05110" +"Otu05111" +"Otu05114" +"Otu05132" +"Otu05133" +"Otu05134" +"Otu05135" +"Otu05137" +"Otu05138" +"Otu05139" +"Otu05146" +"Otu05147" +"Otu05148" +"Otu05149" +"Otu05262" +"Otu05266" +"Otu05267" +"Otu05289" +"Otu05290" +"Otu05291" +"Otu05292" +"Otu05293" +"Otu05365" +"Otu05424" +"Otu05425" +"Otu05479" +"Otu05481" +"Otu05486" +"Otu05491" +"Otu05510" +"Otu05512" +"Otu05550" +"Otu05569" +"Otu05570" +"Otu05572" +"Otu05573" +"Otu05574" +"Otu05575" +"Otu05576" +"Otu05578" +"Otu05580" +"Otu05581" +"Otu05584" 
+"Otu05585" +"Otu05593" +"Otu05602" +"Otu05606" +"Otu05608" +"Otu05609" +"Otu05612" +"Otu05617" +"Otu05619" +"Otu05621" +"Otu05624" +"Otu05629" +"Otu05631" +"Otu05632" +"Otu05633" +"Otu05634" +"Otu05635" +"Otu05682" +"Otu05715" +"Otu05725" +"Otu05727" +"Otu05730" +"Otu05811" +"Otu05838" +"Otu05844" +"Otu05845" +"Otu05847" +"Otu05848" +"Otu05849" +"Otu05850" +"Otu05851" +"Otu05861" +"Otu05867" +"Otu05876" +"Otu05877" +"Otu05878" +"Otu05879" +"Otu05881" +"Otu05882" +"Otu05883" +"Otu05885" +"Otu05886" +"Otu05887" +"Otu05888" +"Otu05889" +"Otu05903" +"Otu05904" +"Otu05905" +"Otu05906" +"Otu05907" +"Otu05908" +"Otu05910" +"Otu05912" +"Otu05932" +"Otu06038" +"Otu06090" +"Otu06091" +"Otu06092" +"Otu06094" +"Otu06097" +"Otu06098" +"Otu06100" +"Otu06263" +"Otu06264" +"Otu06265" +"Otu06266" +"Otu06267" +"Otu06307" diff --git a/Microbiota_analysis_R/OTU.8w.csv b/Microbiota_analysis_R/OTU.8w.csv new file mode 100644 index 0000000..d4d697b --- /dev/null +++ b/Microbiota_analysis_R/OTU.8w.csv @@ -0,0 +1,1382 @@ +"Otu00001" +"Otu00002" +"Otu00003" +"Otu00004" +"Otu00005" +"Otu00006" +"Otu00007" +"Otu00008" +"Otu00009" +"Otu00010" +"Otu00011" +"Otu00012" +"Otu00013" +"Otu00014" +"Otu00015" +"Otu00017" +"Otu00018" +"Otu00019" +"Otu00020" +"Otu00022" +"Otu00023" +"Otu00024" +"Otu00026" +"Otu00027" +"Otu00028" +"Otu00029" +"Otu00030" +"Otu00031" +"Otu00032" +"Otu00033" +"Otu00034" +"Otu00038" +"Otu00039" +"Otu00041" +"Otu00043" +"Otu00044" +"Otu00045" +"Otu00047" +"Otu00048" +"Otu00050" +"Otu00051" +"Otu00053" +"Otu00054" +"Otu00057" +"Otu00058" +"Otu00059" +"Otu00060" +"Otu00061" +"Otu00063" +"Otu00064" +"Otu00065" +"Otu00068" +"Otu00070" +"Otu00071" +"Otu00072" +"Otu00078" +"Otu00079" +"Otu00080" +"Otu00081" +"Otu00082" +"Otu00083" +"Otu00084" +"Otu00085" +"Otu00086" +"Otu00087" +"Otu00089" +"Otu00090" +"Otu00092" +"Otu00095" +"Otu00100" +"Otu00101" +"Otu00107" +"Otu00109" +"Otu00111" +"Otu00112" +"Otu00114" +"Otu00117" +"Otu00120" +"Otu00121" +"Otu00122" +"Otu00123" +"Otu00126" 
+"Otu00127" +"Otu00128" +"Otu00129" +"Otu00130" +"Otu00132" +"Otu00135" +"Otu00136" +"Otu00137" +"Otu00140" +"Otu00141" +"Otu00143" +"Otu00145" +"Otu00148" +"Otu00149" +"Otu00150" +"Otu00151" +"Otu00152" +"Otu00153" +"Otu00155" +"Otu00156" +"Otu00159" +"Otu00160" +"Otu00161" +"Otu00162" +"Otu00170" +"Otu00171" +"Otu00172" +"Otu00173" +"Otu00174" +"Otu00175" +"Otu00176" +"Otu00177" +"Otu00179" +"Otu00180" +"Otu00184" +"Otu00188" +"Otu00189" +"Otu00190" +"Otu00194" +"Otu00195" +"Otu00196" +"Otu00197" +"Otu00198" +"Otu00199" +"Otu00200" +"Otu00206" +"Otu00207" +"Otu00208" +"Otu00209" +"Otu00213" +"Otu00214" +"Otu00215" +"Otu00216" +"Otu00217" +"Otu00218" +"Otu00219" +"Otu00220" +"Otu00221" +"Otu00222" +"Otu00226" +"Otu00227" +"Otu00233" +"Otu00235" +"Otu00236" +"Otu00238" +"Otu00239" +"Otu00240" +"Otu00241" +"Otu00246" +"Otu00250" +"Otu00251" +"Otu00252" +"Otu00253" +"Otu00256" +"Otu00258" +"Otu00259" +"Otu00262" +"Otu00263" +"Otu00265" +"Otu00268" +"Otu00270" +"Otu00273" +"Otu00274" +"Otu00277" +"Otu00279" +"Otu00282" +"Otu00284" +"Otu00285" +"Otu00291" +"Otu00292" +"Otu00293" +"Otu00296" +"Otu00300" +"Otu00306" +"Otu00308" +"Otu00309" +"Otu00310" +"Otu00311" +"Otu00313" +"Otu00314" +"Otu00315" +"Otu00316" +"Otu00319" +"Otu00326" +"Otu00330" +"Otu00331" +"Otu00335" +"Otu00338" +"Otu00342" +"Otu00345" +"Otu00350" +"Otu00351" +"Otu00352" +"Otu00354" +"Otu00355" +"Otu00357" +"Otu00359" +"Otu00360" +"Otu00367" +"Otu00368" +"Otu00369" +"Otu00370" +"Otu00371" +"Otu00372" +"Otu00373" +"Otu00374" +"Otu00378" +"Otu00379" +"Otu00381" +"Otu00382" +"Otu00394" +"Otu00395" +"Otu00396" +"Otu00399" +"Otu00402" +"Otu00404" +"Otu00405" +"Otu00407" +"Otu00413" +"Otu00414" +"Otu00415" +"Otu00418" +"Otu00423" +"Otu00427" +"Otu00428" +"Otu00430" +"Otu00431" +"Otu00435" +"Otu00436" +"Otu00439" +"Otu00440" +"Otu00445" +"Otu00446" +"Otu00447" +"Otu00448" +"Otu00451" +"Otu00454" +"Otu00455" +"Otu00456" +"Otu00458" +"Otu00459" +"Otu00460" +"Otu00461" +"Otu00462" +"Otu00463" +"Otu00464" 
+"Otu00465" +"Otu00469" +"Otu00474" +"Otu00475" +"Otu00477" +"Otu00478" +"Otu00480" +"Otu00486" +"Otu00489" +"Otu00490" +"Otu00495" +"Otu00496" +"Otu00497" +"Otu00500" +"Otu00507" +"Otu00512" +"Otu00517" +"Otu00527" +"Otu00528" +"Otu00530" +"Otu00531" +"Otu00536" +"Otu00539" +"Otu00541" +"Otu00542" +"Otu00543" +"Otu00544" +"Otu00547" +"Otu00549" +"Otu00553" +"Otu00555" +"Otu00556" +"Otu00557" +"Otu00559" +"Otu00562" +"Otu00563" +"Otu00569" +"Otu00571" +"Otu00572" +"Otu00582" +"Otu00583" +"Otu00584" +"Otu00585" +"Otu00588" +"Otu00589" +"Otu00590" +"Otu00593" +"Otu00596" +"Otu00602" +"Otu00606" +"Otu00608" +"Otu00610" +"Otu00616" +"Otu00618" +"Otu00620" +"Otu00621" +"Otu00624" +"Otu00627" +"Otu00628" +"Otu00637" +"Otu00643" +"Otu00645" +"Otu00646" +"Otu00648" +"Otu00651" +"Otu00652" +"Otu00658" +"Otu00660" +"Otu00662" +"Otu00665" +"Otu00668" +"Otu00670" +"Otu00671" +"Otu00672" +"Otu00682" +"Otu00684" +"Otu00689" +"Otu00700" +"Otu00701" +"Otu00702" +"Otu00705" +"Otu00709" +"Otu00718" +"Otu00719" +"Otu00722" +"Otu00725" +"Otu00731" +"Otu00733" +"Otu00739" +"Otu00748" +"Otu00751" +"Otu00753" +"Otu00758" +"Otu00760" +"Otu00761" +"Otu00764" +"Otu00766" +"Otu00767" +"Otu00770" +"Otu00771" +"Otu00772" +"Otu00773" +"Otu00775" +"Otu00776" +"Otu00780" +"Otu00786" +"Otu00787" +"Otu00792" +"Otu00793" +"Otu00795" +"Otu00797" +"Otu00798" +"Otu00799" +"Otu00801" +"Otu00805" +"Otu00807" +"Otu00809" +"Otu00811" +"Otu00813" +"Otu00816" +"Otu00819" +"Otu00820" +"Otu00823" +"Otu00830" +"Otu00831" +"Otu00836" +"Otu00841" +"Otu00849" +"Otu00853" +"Otu00856" +"Otu00867" +"Otu00869" +"Otu00877" +"Otu00879" +"Otu00884" +"Otu00886" +"Otu00889" +"Otu00890" +"Otu00897" +"Otu00898" +"Otu00901" +"Otu00906" +"Otu00908" +"Otu00913" +"Otu00915" +"Otu00917" +"Otu00918" +"Otu00925" +"Otu00932" +"Otu00933" +"Otu00934" +"Otu00935" +"Otu00936" +"Otu00938" +"Otu00940" +"Otu00943" +"Otu00949" +"Otu00950" +"Otu00952" +"Otu00955" +"Otu00960" +"Otu00965" +"Otu00967" +"Otu00968" +"Otu00972" +"Otu00974" 
+"Otu00977" +"Otu00979" +"Otu00991" +"Otu00994" +"Otu00998" +"Otu00999" +"Otu01001" +"Otu01007" +"Otu01008" +"Otu01009" +"Otu01012" +"Otu01014" +"Otu01015" +"Otu01020" +"Otu01021" +"Otu01023" +"Otu01033" +"Otu01036" +"Otu01038" +"Otu01040" +"Otu01043" +"Otu01046" +"Otu01050" +"Otu01055" +"Otu01078" +"Otu01080" +"Otu01081" +"Otu01082" +"Otu01084" +"Otu01087" +"Otu01089" +"Otu01091" +"Otu01093" +"Otu01095" +"Otu01120" +"Otu01121" +"Otu01122" +"Otu01124" +"Otu01125" +"Otu01129" +"Otu01130" +"Otu01140" +"Otu01151" +"Otu01152" +"Otu01153" +"Otu01154" +"Otu01156" +"Otu01158" +"Otu01161" +"Otu01164" +"Otu01167" +"Otu01172" +"Otu01175" +"Otu01177" +"Otu01179" +"Otu01199" +"Otu01201" +"Otu01202" +"Otu01219" +"Otu01220" +"Otu01223" +"Otu01232" +"Otu01234" +"Otu01238" +"Otu01239" +"Otu01240" +"Otu01242" +"Otu01260" +"Otu01261" +"Otu01272" +"Otu01275" +"Otu01283" +"Otu01286" +"Otu01290" +"Otu01292" +"Otu01296" +"Otu01297" +"Otu01307" +"Otu01308" +"Otu01322" +"Otu01326" +"Otu01331" +"Otu01337" +"Otu01341" +"Otu01346" +"Otu01351" +"Otu01353" +"Otu01354" +"Otu01359" +"Otu01364" +"Otu01365" +"Otu01367" +"Otu01369" +"Otu01375" +"Otu01379" +"Otu01384" +"Otu01385" +"Otu01389" +"Otu01390" +"Otu01393" +"Otu01399" +"Otu01407" +"Otu01410" +"Otu01411" +"Otu01415" +"Otu01421" +"Otu01422" +"Otu01424" +"Otu01435" +"Otu01444" +"Otu01445" +"Otu01446" +"Otu01451" +"Otu01454" +"Otu01456" +"Otu01460" +"Otu01463" +"Otu01464" +"Otu01466" +"Otu01471" +"Otu01474" +"Otu01475" +"Otu01477" +"Otu01481" +"Otu01482" +"Otu01488" +"Otu01493" +"Otu01500" +"Otu01506" +"Otu01507" +"Otu01511" +"Otu01512" +"Otu01513" +"Otu01517" +"Otu01518" +"Otu01520" +"Otu01521" +"Otu01523" +"Otu01528" +"Otu01537" +"Otu01544" +"Otu01548" +"Otu01550" +"Otu01551" +"Otu01559" +"Otu01564" +"Otu01566" +"Otu01568" +"Otu01573" +"Otu01574" +"Otu01575" +"Otu01579" +"Otu01580" +"Otu01582" +"Otu01584" +"Otu01586" +"Otu01589" +"Otu01591" +"Otu01597" +"Otu01600" +"Otu01604" +"Otu01607" +"Otu01609" +"Otu01610" +"Otu01611" +"Otu01612" 
+"Otu01615" +"Otu01623" +"Otu01624" +"Otu01630" +"Otu01635" +"Otu01637" +"Otu01638" +"Otu01639" +"Otu01646" +"Otu01649" +"Otu01658" +"Otu01661" +"Otu01663" +"Otu01671" +"Otu01689" +"Otu01691" +"Otu01692" +"Otu01700" +"Otu01701" +"Otu01704" +"Otu01705" +"Otu01706" +"Otu01708" +"Otu01709" +"Otu01712" +"Otu01714" +"Otu01715" +"Otu01716" +"Otu01719" +"Otu01722" +"Otu01723" +"Otu01727" +"Otu01729" +"Otu01731" +"Otu01733" +"Otu01735" +"Otu01737" +"Otu01747" +"Otu01749" +"Otu01750" +"Otu01751" +"Otu01758" +"Otu01760" +"Otu01761" +"Otu01763" +"Otu01765" +"Otu01766" +"Otu01767" +"Otu01770" +"Otu01777" +"Otu01779" +"Otu01784" +"Otu01785" +"Otu01786" +"Otu01787" +"Otu01793" +"Otu01795" +"Otu01804" +"Otu01821" +"Otu01827" +"Otu01838" +"Otu01840" +"Otu01842" +"Otu01843" +"Otu01852" +"Otu01858" +"Otu01859" +"Otu01860" +"Otu01871" +"Otu01877" +"Otu01879" +"Otu01881" +"Otu01882" +"Otu01884" +"Otu01886" +"Otu01888" +"Otu01889" +"Otu01892" +"Otu01894" +"Otu01904" +"Otu01911" +"Otu01913" +"Otu01915" +"Otu01921" +"Otu01923" +"Otu01928" +"Otu01929" +"Otu01931" +"Otu01933" +"Otu01934" +"Otu01935" +"Otu01937" +"Otu01945" +"Otu01949" +"Otu01958" +"Otu01959" +"Otu01962" +"Otu01963" +"Otu01971" +"Otu01978" +"Otu01992" +"Otu01993" +"Otu01994" +"Otu01995" +"Otu02002" +"Otu02008" +"Otu02009" +"Otu02011" +"Otu02025" +"Otu02033" +"Otu02036" +"Otu02042" +"Otu02044" +"Otu02045" +"Otu02046" +"Otu02049" +"Otu02050" +"Otu02051" +"Otu02055" +"Otu02057" +"Otu02077" +"Otu02079" +"Otu02083" +"Otu02087" +"Otu02096" +"Otu02097" +"Otu02099" +"Otu02103" +"Otu02109" +"Otu02113" +"Otu02116" +"Otu02120" +"Otu02122" +"Otu02123" +"Otu02133" +"Otu02137" +"Otu02148" +"Otu02152" +"Otu02157" +"Otu02158" +"Otu02162" +"Otu02175" +"Otu02177" +"Otu02186" +"Otu02189" +"Otu02191" +"Otu02193" +"Otu02199" +"Otu02201" +"Otu02203" +"Otu02205" +"Otu02210" +"Otu02211" +"Otu02212" +"Otu02213" +"Otu02220" +"Otu02224" +"Otu02226" +"Otu02236" +"Otu02242" +"Otu02250" +"Otu02252" +"Otu02259" +"Otu02262" +"Otu02264" +"Otu02266" 
+"Otu02269" +"Otu02276" +"Otu02278" +"Otu02279" +"Otu02284" +"Otu02294" +"Otu02295" +"Otu02297" +"Otu02299" +"Otu02303" +"Otu02304" +"Otu02308" +"Otu02309" +"Otu02311" +"Otu02316" +"Otu02317" +"Otu02318" +"Otu02321" +"Otu02324" +"Otu02330" +"Otu02337" +"Otu02341" +"Otu02342" +"Otu02343" +"Otu02344" +"Otu02345" +"Otu02348" +"Otu02354" +"Otu02367" +"Otu02372" +"Otu02379" +"Otu02390" +"Otu02393" +"Otu02396" +"Otu02401" +"Otu02406" +"Otu02408" +"Otu02412" +"Otu02413" +"Otu02414" +"Otu02415" +"Otu02429" +"Otu02445" +"Otu02447" +"Otu02449" +"Otu02456" +"Otu02463" +"Otu02464" +"Otu02466" +"Otu02478" +"Otu02480" +"Otu02481" +"Otu02487" +"Otu02488" +"Otu02492" +"Otu02493" +"Otu02496" +"Otu02497" +"Otu02498" +"Otu02505" +"Otu02506" +"Otu02507" +"Otu02509" +"Otu02516" +"Otu02523" +"Otu02527" +"Otu02537" +"Otu02544" +"Otu02546" +"Otu02548" +"Otu02549" +"Otu02550" +"Otu02559" +"Otu02566" +"Otu02570" +"Otu02571" +"Otu02580" +"Otu02581" +"Otu02582" +"Otu02585" +"Otu02590" +"Otu02592" +"Otu02593" +"Otu02601" +"Otu02605" +"Otu02607" +"Otu02608" +"Otu02626" +"Otu02634" +"Otu02636" +"Otu02638" +"Otu02643" +"Otu02649" +"Otu02652" +"Otu02657" +"Otu02661" +"Otu02662" +"Otu02664" +"Otu02665" +"Otu02668" +"Otu02669" +"Otu02676" +"Otu02683" +"Otu02692" +"Otu02695" +"Otu02696" +"Otu02697" +"Otu02700" +"Otu02704" +"Otu02707" +"Otu02709" +"Otu02711" +"Otu02712" +"Otu02713" +"Otu02716" +"Otu02721" +"Otu02724" +"Otu02729" +"Otu02732" +"Otu02736" +"Otu02741" +"Otu02751" +"Otu02754" +"Otu02755" +"Otu02757" +"Otu02764" +"Otu02773" +"Otu02776" +"Otu02785" +"Otu02788" +"Otu02790" +"Otu02791" +"Otu02795" +"Otu02796" +"Otu02798" +"Otu02803" +"Otu02804" +"Otu02810" +"Otu02811" +"Otu02813" +"Otu02814" +"Otu02818" +"Otu02824" +"Otu02826" +"Otu02827" +"Otu02828" +"Otu02830" +"Otu02839" +"Otu02840" +"Otu02844" +"Otu02846" +"Otu02847" +"Otu02849" +"Otu02853" +"Otu02857" +"Otu02861" +"Otu02862" +"Otu02863" +"Otu02866" +"Otu02869" +"Otu02874" +"Otu02875" +"Otu02877" +"Otu02888" +"Otu02894" +"Otu02895" 
+"Otu02896" +"Otu02899" +"Otu02900" +"Otu02907" +"Otu02909" +"Otu02911" +"Otu02913" +"Otu02915" +"Otu02919" +"Otu02920" +"Otu02924" +"Otu02929" +"Otu02930" +"Otu02937" +"Otu02940" +"Otu02942" +"Otu02951" +"Otu02961" +"Otu02965" +"Otu02971" +"Otu02972" +"Otu02981" +"Otu02992" +"Otu02994" +"Otu02996" +"Otu03000" +"Otu03006" +"Otu03014" +"Otu03016" +"Otu03019" +"Otu03023" +"Otu03029" +"Otu03030" +"Otu03033" +"Otu03044" +"Otu03045" +"Otu03052" +"Otu03058" +"Otu03061" +"Otu03074" +"Otu03076" +"Otu03077" +"Otu03079" +"Otu03087" +"Otu03091" +"Otu03095" +"Otu03099" +"Otu03102" +"Otu03107" +"Otu03109" +"Otu03111" +"Otu03116" +"Otu03120" +"Otu03123" +"Otu03126" +"Otu03127" +"Otu03130" +"Otu03135" +"Otu03137" +"Otu03141" +"Otu03149" +"Otu03151" +"Otu03154" +"Otu03157" +"Otu03160" +"Otu03164" +"Otu03165" +"Otu03167" +"Otu03168" +"Otu03169" +"Otu03170" +"Otu03172" +"Otu03176" +"Otu03189" +"Otu03193" +"Otu03198" +"Otu03201" +"Otu03212" +"Otu03216" +"Otu03226" +"Otu03232" +"Otu03233" +"Otu03242" +"Otu03247" +"Otu03251" +"Otu03252" +"Otu03254" +"Otu03256" +"Otu03259" +"Otu03260" +"Otu03263" +"Otu03267" +"Otu03268" +"Otu03278" +"Otu03280" +"Otu03283" +"Otu03294" +"Otu03302" +"Otu03307" +"Otu03310" +"Otu03311" +"Otu03315" +"Otu03316" +"Otu03328" +"Otu03331" +"Otu03333" +"Otu03357" +"Otu03364" +"Otu03365" +"Otu03367" +"Otu03371" +"Otu03375" +"Otu03376" +"Otu03392" +"Otu03394" +"Otu03401" +"Otu03405" +"Otu03409" +"Otu03410" +"Otu03412" +"Otu03413" +"Otu03416" +"Otu03417" +"Otu03418" +"Otu03423" +"Otu03427" +"Otu03428" +"Otu03434" +"Otu03443" +"Otu03450" +"Otu03454" +"Otu03457" +"Otu03458" +"Otu03460" +"Otu03468" +"Otu03469" +"Otu03472" +"Otu03473" +"Otu03477" +"Otu03486" +"Otu03507" +"Otu03509" +"Otu03514" +"Otu03524" +"Otu03529" +"Otu03539" +"Otu03540" +"Otu03549" +"Otu03550" +"Otu03553" +"Otu03562" +"Otu03565" +"Otu03569" +"Otu03577" +"Otu03585" +"Otu03586" +"Otu03588" +"Otu03597" +"Otu03598" +"Otu03606" +"Otu03611" +"Otu03612" +"Otu03613" +"Otu03614" +"Otu03616" +"Otu03620" 
+"Otu03622" +"Otu03623" +"Otu03631" +"Otu03634" +"Otu03648" +"Otu03649" +"Otu03654" +"Otu03656" +"Otu03659" +"Otu03666" +"Otu03667" +"Otu03669" +"Otu03673" +"Otu03686" +"Otu03693" +"Otu03706" +"Otu03707" +"Otu03713" +"Otu03721" +"Otu03724" +"Otu03726" +"Otu03727" +"Otu03740" +"Otu03742" +"Otu03745" +"Otu03747" +"Otu03754" +"Otu03757" +"Otu03764" +"Otu03775" +"Otu03780" +"Otu03793" +"Otu03796" +"Otu03802" +"Otu03805" +"Otu03825" +"Otu03829" +"Otu03830" +"Otu03832" +"Otu03833" +"Otu03837" +"Otu03848" +"Otu03857" +"Otu03874" +"Otu03875" +"Otu03876" +"Otu03878" +"Otu03880" +"Otu03881" +"Otu03882" +"Otu03886" +"Otu03893" +"Otu03900" +"Otu03901" +"Otu03904" +"Otu03921" +"Otu03926" +"Otu03927" +"Otu03937" +"Otu03943" +"Otu03944" +"Otu03946" +"Otu03950" +"Otu03959" +"Otu03960" +"Otu03964" +"Otu03970" +"Otu03994" +"Otu03995" +"Otu03998" +"Otu04003" +"Otu04006" +"Otu04007" +"Otu04014" +"Otu04017" +"Otu04033" +"Otu04034" +"Otu04037" +"Otu04057" +"Otu04059" +"Otu04063" +"Otu04079" +"Otu04084" +"Otu04089" +"Otu04090" +"Otu04094" +"Otu04101" +"Otu04110" +"Otu04122" +"Otu04123" +"Otu04126" +"Otu04132" +"Otu04138" +"Otu04150" +"Otu04156" +"Otu04158" +"Otu04159" +"Otu04160" +"Otu04161" +"Otu04165" +"Otu04172" +"Otu04174" +"Otu04175" +"Otu04179" +"Otu04180" +"Otu04182" +"Otu04188" +"Otu04189" +"Otu04190" +"Otu04194" +"Otu04196" +"Otu04197" +"Otu04200" +"Otu04201" +"Otu04203" +"Otu04204" +"Otu04212" +"Otu04213" +"Otu04218" +"Otu04222" +"Otu04223" +"Otu04224" +"Otu04229" +"Otu04238" +"Otu04240" +"Otu04242" +"Otu04243" +"Otu04244" +"Otu04246" +"Otu04248" +"Otu04249" +"Otu04250" +"Otu04251" +"Otu04252" +"Otu04261" +"Otu04264" +"Otu04270" +"Otu04289" +"Otu04297" +"Otu04305" +"Otu04312" +"Otu04314" +"Otu04321" +"Otu04329" +"Otu04333" +"Otu04339" +"Otu04352" +"Otu04367" +"Otu04384" +"Otu04387" +"Otu04388" +"Otu04393" +"Otu04400" +"Otu04410" +"Otu04428" +"Otu04441" +"Otu04447" +"Otu04448" +"Otu04451" +"Otu04452" +"Otu04458" +"Otu04460" +"Otu04469" +"Otu04476" +"Otu04477" +"Otu04485" 
+"Otu04493" +"Otu04499" +"Otu04514" +"Otu04528" +"Otu04529" +"Otu04541" +"Otu04542" +"Otu04550" +"Otu04551" +"Otu04556" +"Otu04574" +"Otu04588" +"Otu04602" +"Otu04603" +"Otu04604" +"Otu04607" +"Otu04618" +"Otu04621" +"Otu04623" +"Otu04632" +"Otu04635" +"Otu04643" +"Otu04644" +"Otu04646" +"Otu04647" +"Otu04648" +"Otu04651" +"Otu04677" +"Otu04680" +"Otu04681" +"Otu04682" +"Otu04683" +"Otu04687" +"Otu04695" +"Otu04707" +"Otu04708" +"Otu04710" +"Otu04714" +"Otu04716" +"Otu04720" +"Otu04721" +"Otu04734" +"Otu04740" +"Otu04744" +"Otu04755" +"Otu04757" +"Otu04762" +"Otu04768" +"Otu04782" +"Otu04790" +"Otu04808" +"Otu04822" +"Otu04832" +"Otu04834" +"Otu04835" +"Otu04847" +"Otu04856" +"Otu04867" +"Otu04888" +"Otu04902" +"Otu04905" +"Otu04915" +"Otu04920" +"Otu04921" +"Otu04945" +"Otu04985" +"Otu04987" +"Otu05003" +"Otu05004" +"Otu05050" +"Otu05051" +"Otu05052" +"Otu05087" +"Otu05091" +"Otu05093" +"Otu05098" +"Otu05112" +"Otu05115" +"Otu05124" +"Otu05129" +"Otu05136" +"Otu05150" +"Otu05165" +"Otu05167" +"Otu05168" +"Otu05182" +"Otu05184" +"Otu05185" +"Otu05187" +"Otu05218" +"Otu05220" +"Otu05221" +"Otu05308" +"Otu05521" +"Otu05522" +"Otu05523" +"Otu05524" +"Otu05548" +"Otu05549" +"Otu05582" +"Otu05613" +"Otu05703" +"Otu05819" +"Otu05820" +"Otu05921" +"Otu05922" +"Otu05923" +"Otu05924" +"Otu05926" +"Otu05928" +"Otu05929" +"Otu05931" +"Otu05947" +"Otu05954" +"Otu05964" +"Otu05965" +"Otu05967" +"Otu05968" +"Otu05969" +"Otu05970" +"Otu05973" +"Otu05974" +"Otu05978" +"Otu05981" +"Otu05992" +"Otu06055" +"Otu06104" +"Otu06131" +"Otu06146" +"Otu06148" +"Otu06151" +"Otu06152" +"Otu06188" +"Otu06195" +"Otu06201" +"Otu06232" +"Otu06243" +"Otu06290" diff --git a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.html b/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.html deleted file mode 100755 index 5a461e1..0000000 --- a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R.html +++ /dev/null @@ -1,2419 +0,0 @@ - - - - - - - - - - - - - - - -Microbiota Analysis in R - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
-
-
-
-
- -
- - - - - - - -

Updated April 5, 2017

-

Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP

-
-

Tips for this workshop

-
    -
  1. If you have any issues in R, type ?command into the console, where “command” is the function you are having issues with, and its help page will come up. If you are unsure of the exact function name, type ??command to search all help pages instead.
  2. -
  3. Lines starting with # are comments that are for the reader’s benefit. These lines are not code and do not need to be entered into the console.
  4. -
  5. GREY boxes contain code that you can copy and paste to run on your machine.
  6. -
-
#GREY box
-
    -
  1. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console.

    -
    WHITE box
  2. -
  3. Basic R code you may find useful: -
      -
    1. Matrices/data frames are designated by [ , ] where it is [rows, columns]
    2. -
    3. | is or
    4. -
    5. & is and
    6. -
  4. -
-
-
-

Introduction

-

Written for R v3.3.2 in RStudio v1.0.136

-
-

Goal

-

The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs).

-

This tutorial assumes some basic statistical knowledge. Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don’t become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data!

-
-
-

Data

-

The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in Dill-McFarland et al. Sci Rep 7: 40864. Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA).

-
-
-

Files

-

We will use the following files created using the Microbiota Processing in mothur: Standard Operating Procedure (SOP).

-
    -
  • example.final.an.unique_list.0.03.norm.shared (OTU table)
  • -
  • example.final.an.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs)
  • -
-

We will also be using tab-delimited metadata and SCFA files created in Excel. The metadata file includes sample information (like age and ADGKG) as well as alpha-diversity metrics from example.final.an.unique_list.0.03.norm.groups.summary calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals.

-
    -
  • example.metadata.txt
  • -
  • example.SCFA.txt
  • -
-

Finally, we will be loading a number of custom scripts from Steinberger_scripts and a pre-calculated OTU tree NJ.tree.RData. The information for creating this tree is provided in this tutorial.

-
-
-
-

Get set up

-
-

Download and install

-
    -
  • Base R: http://cran.mtu.edu/
  • -
  • RStudio: https://www.rstudio.com/products/rstudio/download3/
  • -
  • Packages: Open RStudio on your computer. If you have not already installed these packages, go to the lower right quadrant of your screen and open the Packages tab. Click “Install” and search for the package you want to install. -
      -
    • tidyr
    • -
    • dplyr
    • -
    • vegan
    • -
    • ape
    • -
    • ggplot2
    • -
    • gplots
    • -
    • plotly
    • -
    • phangorn
    • -
    • phyloseq (phyloseq is not on CRAN, so we have to call it manually. See below.)
    • -
  • -
-

Copy and paste the following into your console.

-
source("https://bioconductor.org/biocLite.R")
-
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
-
biocLite("phyloseq")
-
## BioC_mirror: https://bioconductor.org
-
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
-
## Installing package(s) 'phyloseq'
-
## package 'phyloseq' successfully unpacked and MD5 sums checked
-## 
-## The downloaded binary packages are in
-##  C:\Users\suenlab\AppData\Local\Temp\Rtmp6Zng0T\downloaded_packages
-
## Old packages: 'ade4', 'curl', 'DBI', 'IRanges', 'phangorn', 'psych',
-##   'Rcpp', 'readr', 'rmarkdown', 'S4Vectors', 'scatterplot3d', 'shiny',
-##   'stringi', 'survival', 'tibble', 'viridis', 'viridisLite', 'XVector',
-##   'cluster', 'lattice'
-

Note: If you are having trouble installing packages, turn off your computer’s firewall temporarily.

-
-
-

Organization

-

All of our analyses will be organized into a “Project”.

-

Make a new project by selecting File->New project. Select “New Directory” and “Empty Project”. Name the project “Microbiota_Analysis_BRC” and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop.

-

Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder.

-

Now your screen should look like this

-
    -
  • Upper left: Where you type and save the code you want to run.
  • -
  • Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane.
  • -
  • Lower left: The console, where commands are run and their output is displayed (similar to the mothur window).
  • -
  • Lower right: Varies depending on the selected tab. Explore the different tabs.
  • -
-
-
-
-

Data manipulation

-
-

Load Packages

-

The “library” command tells R to open the package you want to use. You need to do this every time you open R.

-
#This package will help us more easily manipulate our data, which are matrices
-library(tidyr)
-
-#This package will also help us more easily manipulate our data
-library(dplyr)
-
## 
-## Attaching package: 'dplyr'
-
## The following objects are masked from 'package:stats':
-## 
-##     filter, lag
-
## The following objects are masked from 'package:base':
-## 
-##     intersect, setdiff, setequal, union
-
#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses.
-library(vegan)
-
## Loading required package: permute
-
## Loading required package: lattice
-
## This is vegan 2.4-2
-
#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses.
-library(phyloseq)
-
-#Analyses of Phylogenetics and Evolution package. Required for tree calculations to be used with phyloseq
-library(ape)
-
-#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package.
-library(ggplot2)
-
-#This package is used to calculate and plot Venn diagrams as well as heatmaps
-library(gplots)
-
## 
-## Attaching package: 'gplots'
-
## The following object is masked from 'package:stats':
-## 
-##     lowess
-
#A package to create interactive web graphics of use in 3D plots
-library(plotly)
-
## 
-## Attaching package: 'plotly'
-
## The following object is masked from 'package:ggplot2':
-## 
-##     last_plot
-
## The following object is masked from 'package:stats':
-## 
-##     filter
-
## The following object is masked from 'package:graphics':
-## 
-##     layout
-
#used to read in mothur-formatted files
-library(phangorn)
-
## 
-## Attaching package: 'phangorn'
-
## The following objects are masked from 'package:vegan':
-## 
-##     diversity, treedist
-
-
-

Load Data

-

In the code, the text before = is what the file will be called in R. Make this short but unique as this is how you will tell R to use this file in later commands.

-
    -
  • header: tells R that the first row is column names, not data
  • -
  • row.names: tells R that the first column is row names, not data
  • -
  • sep: tells R that the data are tab-delimited. If you had a comma-delimited file, you would use sep=","
  • -
-
#OTU table (shared file)
-OTU = read.table("example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t")
-
-#Taxonomy of each OTU
-tax = read.table("example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t")
-
-#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names)
-meta = read.table("example.metadata.txt", header=TRUE, row.names=1, sep="\t")
-
-#SCFA data
-SCFA = read.table("example.SCFA.txt", header=TRUE, row.names=1, sep="\t")
-
-
-

Clean up the data

-

You can look at your data by clicking on it in the upper-right quadrant “Environment”

-

There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them.

-
-

OTU table

-

We need to use the “Group” column as the row names so that it will match our metadata

-
row.names(OTU) = OTU$Group
-

We then need to remove the “label”, “numOtus”, and “Group” columns as they are not OTU counts like the rest of the table

-
OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))]
-
-
-

Taxonomy table

-

For the taxonomy table, we name the rows by the OTU #

-
row.names(tax) = tax$OTU
-

Remove all the OTUs that don’t occur in our OTU.clean data set

-
tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),]
-

We then need to separate the “Taxonomy” column so that each level (i.e. Domain, Phylum, etc.) is in its own column. We do this with a special command “separate” from the tidyr package

-
tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";")
-

Finally, we remove the “Size” and “Strain” columns as well as “OTU” since these are now the row names

-
tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))]
-
-
-

Metadata and SCFA tables

-

These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis.

-
-
-
-

Order the data

-

To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these.

-
OTU.clean = OTU.clean[order(row.names(OTU.clean)),]
-meta = meta[order(row.names(meta)),]
-SCFA = SCFA[order(row.names(SCFA)),]
-

Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it.

-
-
-

Set seed

-

We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed.

-
set.seed(8765)
-
-
-
-

Alpha-diversity

-

Alpha-diversity is within sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric.

-

This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2.

-
-

Explore alpha metrics

-

Now we will start to look at our data. We will first start with alpha-diversity and richness. Let’s plot some common ones here.

-
#Create 2x2 plot environment so that we can see all 4 metrics at once. 
-par(mfrow = c(2, 2))
-
-#Then plot each metric.
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis.

-

Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I’ve specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon’s diversity as well as most richness metrics. Simpson’s diversity, on the other hand, is usually skewed as seen here.

-

So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity.

-

Let’s look at inverse Simpson instead.

-
#Create 2x2 plot environment 
-par(mfrow = c(2, 2))
-
-#Plots
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

Now we see a bimodal distribution for Simpson similar to the richness metrics.

-

To test for normalcy statistically, we can run the Shapiro-Wilk test of normality.

-
shapiro.test(meta$shannon)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$shannon
-## W = 0.91511, p-value = 0.0456
-
shapiro.test(1/meta$simpson)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  1/meta$simpson
-## W = 0.74821, p-value = 4.69e-05
-
shapiro.test(meta$chao)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$chao
-## W = 0.80636, p-value = 0.0003749
-
shapiro.test(meta$ace)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$ace
-## W = 0.83017, p-value = 0.0009573
-

We see that, as expected from the graphs, none are normal.

-

However, our sample size is small and normalcy tests are very sensitive for small data-sets. In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!)

-

So, what does this mean for our purposes? Well, we should run statistical tests that don’t assume our data is normal, because we don’t have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well.

-

Overall, for alpha-diversity:

-
    -
  • ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal
  • -
  • Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal
  • -
-

Our main variables of interest are

-
    -
  • AgeGroup: 2w, 8w, 1yr
  • -
  • ADGKG: 0.05-1.56 kg gained per day (average daily gain kg)
  • -
-
-
-

Categorical variables

-

Now that we know which tests can be used, let’s run them.

-

Normally distributed metrics

-

Since it’s the closest to normalcy, we will use Shannon’s diversity as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test

-

Does age impact the Shannon diversity of the fecal microbiota?

-
#Run the ANOVA and save it as an object
-aov.shannon.age = aov(shannon ~ AgeGroup, data=meta)
-#Call for the summary of that ANOVA, which will include P-values
-summary(aov.shannon.age)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   103.4 1.35e-11 ***
-## Residuals   21   4.36   0.208                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey’s honest significance test of our ANOVA.

-
TukeyHSD(aov.shannon.age)
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup, data = meta)
-## 
-## $AgeGroup
-##             diff        lwr       upr   p adj
-## 2w-1yr -3.270063 -3.8446230 -2.695503 0.0e+00
-## 8w-1yr -1.830903 -2.4054628 -1.256342 2.0e-07
-## 8w-2w   1.439160  0.8646001  2.013720 8.5e-06
-

We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age.

-
#Re-order the groups because the default is 1yr-2w-8w
-meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr"))
-#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity")
-

-

Non-normally distributed metrics

-

We will use Chao’s richness estimate here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test)

-
kruskal.test(chao ~ AgeGroup, data=meta)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  chao by AgeGroup
-## Kruskal-Wallis chi-squared = 19.28, df = 2, p-value = 6.507e-05
-

We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests

-
pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr")
-
## 
-##  Pairwise comparisons using Wilcoxon rank sum test 
-## 
-## data:  meta$chao and meta$AgeGroup 
-## 
-##    1yr     2w     
-## 2w 0.00023 -      
-## 8w 0.00023 0.00186
-## 
-## P value adjustment method: fdr
-

Like diversity, we see that richness also increases with age.

-
#Create 1x1 plot environment
-par(mfrow = c(1, 1))
-#Plot
-boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness")
-

-
-
-

Continuous variables

-

For continuous variables, we use general linear models, specifying the distribution that best fits our data.

-

Normally distributed metrics

-

Since ADG is a continuous variable, we run a general linear model. We will again use Shannon’s diversity as our roughly normal metric. The default of glm and lm is the normal distribution so we don’t have to specify anything.

-

Does ADG impact the Shannon diversity of the fecal microbiota?

-
glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta)
-summary(glm.shannon.ADG)
-
## 
-## Call:
-## glm(formula = shannon ~ ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -2.49110  -1.11216  -0.01749   1.53658   1.84728  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)   
-## (Intercept)  3.62565    1.01390   3.576  0.00169 **
-## ADGKG       -0.03407    0.97805  -0.035  0.97253   
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 2.151815)
-## 
-##     Null deviance: 47.343  on 23  degrees of freedom
-## Residual deviance: 47.340  on 22  degrees of freedom
-## AIC: 90.412
-## 
-## Number of Fisher Scoring iterations: 2
-

The output lets us know that the intercept of our model is significantly different from 0 but our slope (i.e. our variable of interest) is not. This makes sense when we look at the data.

-
plot(shannon ~ ADGKG, data=meta)
-#Add the glm best fit line
-abline(glm.shannon.ADG)
-

-

Non-normally distributed metrics

-

We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better.

-

But which distribution should we choose? In statistics, there is no one “best” model. There are only good and better models. We will use the plot() function to compare two models and pick the better one.

-

First, the Gaussian (normal) distribution, which we already know is a bad fit.

-
gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian")
-par(mfrow = c(1,2))
-plot(gaussian.chao.ADG, which=c(1,2))
-

-

Quasipoisson (log) distribution

-
qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson")
-par(mfrow = c(1,2))
-plot(qp.chao.ADG, which=c(1,2))
-

-

What we’re looking for is no pattern in the Residuals vs. Fitted graph (“stars in the sky”), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot.

-

While it’s still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness.

-
summary(qp.chao.ADG)
-
## 
-## Call:
-## glm(formula = chao ~ ADGKG, family = "quasipoisson", data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -24.36  -17.05  -10.66   18.81   26.91  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   6.4528     0.5561  11.605 7.54e-11 ***
-## ADGKG        -0.1859     0.5438  -0.342    0.736    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 374.2485)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance: 8074.4  on 22  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 5
-

Plotting this, we see that, indeed, there is no significant correlation between Chao and ADG.

-
#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)")
-abline(qp.chao.ADG)
-

-
-
-

Mixed models

-

Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG. In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr).

-

Think about your variables and what they mean “in the real world.” Logically combine them into as few ANOVA tests as possible. In the end, it’s better to test a meaningless interaction (as it will most likely not be significant) than not test a meaningful one.

-

We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The * symbol is a shortcut for models. A*B is equivalent to A + B + A:B

-
aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(aov.shannon.all)
-
##                Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup        2  42.98  21.489  95.472 2.61e-10 ***
-## ADGKG           1   0.05   0.054   0.239    0.631    
-## AgeGroup:ADGKG  2   0.26   0.130   0.576    0.572    
-## Residuals      18   4.05   0.225                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We can see that the interaction of age and ADG doesn’t significantly impact Shannon diversity, so we should remove that term to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms.

-
aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(aov.shannon.all2)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   99.70 3.96e-11 ***
-## ADGKG        1   0.05   0.054    0.25    0.623    
-## Residuals   20   4.31   0.216                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only.

-
TukeyHSD(aov.shannon.all)
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## ADGKG
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## AgeGroup, ADGKG
-
## Warning in TukeyHSD.aov(aov.shannon.all): 'which' specified some non-
-## factors which will be dropped
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## $AgeGroup
-##             diff       lwr       upr    p adj
-## 2w-1yr -3.270063 -3.875469 -2.664657 0.00e+00
-## 8w-1yr -1.830903 -2.436309 -1.225496 1.20e-06
-## 8w-2w   1.439160  0.833754  2.044567 2.81e-05
-

However, you will see that we don’t get any data from ADG since it is continuous. There is a warning denoting this as “non-factors ignored: ADGKG”

-

So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output.

-
glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(glm.shannon.all)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##     Min       1Q   Median       3Q      Max  
-## -1.0301  -0.2468   0.0894   0.1572   0.7624  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)        5.7123     2.5928   2.203   0.0409 *
-## AgeGroup2w        -3.3969     2.6197  -1.297   0.2111  
-## AgeGroup8w        -2.9610     2.7554  -1.075   0.2967  
-## ADGKG             -0.4481     2.7599  -0.162   0.8728  
-## AgeGroup2w:ADGKG   0.1228     2.7848   0.044   0.9653  
-## AgeGroup8w:ADGKG   1.0750     2.8763   0.374   0.7130  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.22508)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.0514  on 18  degrees of freedom
-## AIC: 39.413
-## 
-## Number of Fisher Scoring iterations: 2
-

Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. Let’s go through each line

-
    -
  • (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn’t expect it to be for any alpha-diversity metric since 0 means nothing is there

  • -
  • AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing “shannon ~ AgeGroup” and only looking at the 2w-1yr pairwise comparison)
  • -
  • AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison

  • -
  • ADGKG - the slope of shannon to ADGKG (the same as testing “shannon ~ ADGKG”)

  • -
  • AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr
  • -
  • AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr

  • -
-

As we saw in ANOVA, none of the interaction terms are significant so we remove them.

-
glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(glm.shannon.all2)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup + ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -0.95299  -0.25858   0.07643   0.30409   0.74487  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   5.4459     0.3487  15.619 1.14e-12 ***
-## AgeGroup2w   -3.2760     0.2324 -14.094 7.55e-12 ***
-## AgeGroup8w   -1.7989     0.2408  -7.471 3.30e-07 ***
-## ADGKG        -0.1639     0.3281  -0.500    0.623    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.2155447)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.3109  on 20  degrees of freedom
-## AIC: 36.903
-## 
-## Number of Fisher Scoring iterations: 2
-

Note: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables.

-

We can run a similar test with non-normal data like Chao.

-
qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup * ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.774  -3.430  -0.140   3.692   5.277  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)       6.99825    0.71122   9.840 1.14e-08 ***
-## AgeGroup2w       -1.61539    0.75272  -2.146   0.0458 *  
-## AgeGroup8w       -2.24498    0.86846  -2.585   0.0187 *  
-## ADGKG             0.01751    0.75699   0.023   0.9818    
-## AgeGroup2w:ADGKG -0.42295    0.80094  -0.528   0.6039    
-## AgeGroup8w:ADGKG  0.86269    0.86550   0.997   0.3321    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 18.86331)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance:  348.5  on 18  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-

Remove the non-significant interaction.

-
qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all2)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup + ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.783  -3.452  -1.378   3.744   8.184  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)  7.03944    0.23567  29.870  < 2e-16 ***
-## AgeGroup2w  -1.98090    0.14862 -13.329 2.08e-11 ***
-## AgeGroup8w  -1.24286    0.11926 -10.422 1.57e-09 ***
-## ADGKG       -0.02643    0.24530  -0.108    0.915    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 23.74583)
-## 
-##     Null deviance: 8117.20  on 23  degrees of freedom
-## Residual deviance:  476.31  on 20  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-

From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.

-
-
-
-

Beta-diversity

-

Beta-diversity is between sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (i.e. diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (i.e. richness: Jaccard, unweighted UniFrac).

-

Beta-diversity appears like the following (completely made-up numbers)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
.         sample1   sample2   sample3
sample1   0         0.345     0.194
sample2   0.345     0         0.987
sample3   0.194     0.987     0
-
-

Visualization

-

The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal component analysis (PCA) or principal coordinate analysis (PCoA) if you’ve heard of those, only nMDS is more statistically robust with multiple iterations in the form of the trymax part of the command.

-

Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar.

-
-

OTU-based metrics

-

There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account. The distance between Prevotella OTU1 and Prevotella OTU2 is equivalent to the distance between Prevotella OTU1 and Bacteroides OTU1.

-
-

Dot plots

-

First, we calculate the nMDS values for a 2-axis k=2 graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (i.e. diversity). This uses the metaMDS function from the package vegan.

-
BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.06208161 
-## Run 1 stress 0.06210668 
-## ... Procrustes: rmse 0.001636313  max resid 0.005662513 
-## ... Similar to previous best
-## Run 2 stress 0.06208261 
-## ... Procrustes: rmse 0.0008174643  max resid 0.00186259 
-## ... Similar to previous best
-## Run 3 stress 0.06208133 
-## ... New best solution
-## ... Procrustes: rmse 0.000495613  max resid 0.001143981 
-## ... Similar to previous best
-## Run 4 stress 0.06208228 
-## ... Procrustes: rmse 0.0002768028  max resid 0.0006083455 
-## ... Similar to previous best
-## Run 5 stress 0.06208254 
-## ... Procrustes: rmse 0.0003377152  max resid 0.0007457908 
-## ... Similar to previous best
-## Run 6 stress 0.06208233 
-## ... Procrustes: rmse 0.000285801  max resid 0.000626649 
-## ... Similar to previous best
-## Run 7 stress 0.06210685 
-## ... Procrustes: rmse 0.001453303  max resid 0.005539077 
-## ... Similar to previous best
-## Run 8 stress 0.062104 
-## ... Procrustes: rmse 0.001430176  max resid 0.005147467 
-## ... Similar to previous best
-## Run 9 stress 0.06208351 
-## ... Procrustes: rmse 0.0005018534  max resid 0.00111944 
-## ... Similar to previous best
-## Run 10 stress 0.06208269 
-## ... Procrustes: rmse 0.0003614257  max resid 0.0008024269 
-## ... Similar to previous best
-## Run 11 stress 0.06208154 
-## ... Procrustes: rmse 0.0004861021  max resid 0.001120926 
-## ... Similar to previous best
-## Run 12 stress 0.06212707 
-## ... Procrustes: rmse 0.001859292  max resid 0.005339963 
-## ... Similar to previous best
-## Run 13 stress 0.3702005 
-## Run 14 stress 0.06210406 
-## ... Procrustes: rmse 0.001425256  max resid 0.00512563 
-## ... Similar to previous best
-## Run 15 stress 0.06208142 
-## ... Procrustes: rmse 3.189023e-05  max resid 6.612762e-05 
-## ... Similar to previous best
-## Run 16 stress 0.06210429 
-## ... Procrustes: rmse 0.001578454  max resid 0.005195898 
-## ... Similar to previous best
-## Run 17 stress 0.06210796 
-## ... Procrustes: rmse 0.00155285  max resid 0.005626229 
-## ... Similar to previous best
-## Run 18 stress 0.06208191 
-## ... Procrustes: rmse 0.0001981339  max resid 0.0004391198 
-## ... Similar to previous best
-## Run 19 stress 0.06208168 
-## ... Procrustes: rmse 0.0001331311  max resid 0.000291077 
-## ... Similar to previous best
-## Run 20 stress 0.06210592 
-## ... Procrustes: rmse 0.001396183  max resid 0.005412384 
-## ... Similar to previous best
-## *** Solution reached
-

We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2 axes are sufficient to view the data.

-

Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages

-
par(mfrow = c(1, 1))
-#Create a blank plot for the nmds
-plot(BC.nmds, type="n", main="Bray-Curtis")
-#Add the points colored by age
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-#Add a legend
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

This will create a plot in the lower right quadrant. If you want to get fancy, type “?plot” in the console to see other ways to modify the plot function.

-

A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (i.e. richness).

-
J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.0620818 
-## Run 1 stress 0.06208178 
-## ... New best solution
-## ... Procrustes: rmse 0.0007016851  max resid 0.001623036 
-## ... Similar to previous best
-## Run 2 stress 0.06210633 
-## ... Procrustes: rmse 0.001409348  max resid 0.005467011 
-## ... Similar to previous best
-## Run 3 stress 0.06210745 
-## ... Procrustes: rmse 0.001470069  max resid 0.00557513 
-## ... Similar to previous best
-## Run 4 stress 0.06208144 
-## ... New best solution
-## ... Procrustes: rmse 0.0001309513  max resid 0.0002717662 
-## ... Similar to previous best
-## Run 5 stress 0.06208156 
-## ... Procrustes: rmse 5.349512e-05  max resid 0.0001195792 
-## ... Similar to previous best
-## Run 6 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 2.027381e-05  max resid 4.710602e-05 
-## ... Similar to previous best
-## Run 7 stress 0.06208345 
-## ... Procrustes: rmse 0.0004560942  max resid 0.001010311 
-## ... Similar to previous best
-## Run 8 stress 0.06210681 
-## ... Procrustes: rmse 0.001448074  max resid 0.005531499 
-## ... Similar to previous best
-## Run 9 stress 0.06208334 
-## ... Procrustes: rmse 0.0004470347  max resid 0.000984174 
-## ... Similar to previous best
-## Run 10 stress 0.06208155 
-## ... Procrustes: rmse 7.705878e-05  max resid 0.0001651192 
-## ... Similar to previous best
-## Run 11 stress 0.06208217 
-## ... Procrustes: rmse 0.0002412108  max resid 0.0005340427 
-## ... Similar to previous best
-## Run 12 stress 0.06210429 
-## ... Procrustes: rmse 0.001420012  max resid 0.005133791 
-## ... Similar to previous best
-## Run 13 stress 0.06208263 
-## ... Procrustes: rmse 0.0002884997  max resid 0.0006395557 
-## ... Similar to previous best
-## Run 14 stress 0.06208166 
-## ... Procrustes: rmse 0.0001135875  max resid 0.0002424163 
-## ... Similar to previous best
-## Run 15 stress 0.06210651 
-## ... Procrustes: rmse 0.001438738  max resid 0.005503184 
-## ... Similar to previous best
-## Run 16 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 6.516686e-05  max resid 0.0001605969 
-## ... Similar to previous best
-## Run 17 stress 0.06208244 
-## ... Procrustes: rmse 0.0002976643  max resid 0.0007159927 
-## ... Similar to previous best
-## Run 18 stress 0.06208222 
-## ... Procrustes: rmse 0.0002618419  max resid 0.0006358936 
-## ... Similar to previous best
-## Run 19 stress 0.06208197 
-## ... Procrustes: rmse 0.000208525  max resid 0.0005678922 
-## ... Similar to previous best
-## Run 20 stress 0.0620832 
-## ... Procrustes: rmse 0.0004189108  max resid 0.0009707012 
-## ... Similar to previous best
-## *** Solution reached
-
plot(J.nmds, type="n", main="Jaccard")
-points(J.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-3, 1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. This is because, by default (i.e. without binary=TRUE), vegan calculates the quantitative form of Jaccard, which is a transformation of Bray-Curtis with J = 2BC/(1+BC)

-
-
-

Ellipses

-

You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using ordiellipse from vegan.

-

Code courtesy of Madison Cox.

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota.

-
-
-

3D plots

-

If your stress is high (like over 0.3) for your metaMDS calculation, you probably need to increase to 3 axes k=3. Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the plotly package to visualize a 3D Bray-Curtis plot.

-
#Calculate the Bray-Curtis nMDS for 3-axis
-BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.04686346 
-## Run 1 stress 0.04741659 
-## Run 2 stress 0.04673425 
-## ... New best solution
-## ... Procrustes: rmse 0.01073904  max resid 0.0344814 
-## Run 3 stress 0.05061835 
-## Run 4 stress 0.04740131 
-## Run 5 stress 0.04984642 
-## Run 6 stress 0.04747801 
-## Run 7 stress 0.05226505 
-## Run 8 stress 0.05295437 
-## Run 9 stress 0.04741387 
-## Run 10 stress 0.0457586 
-## ... New best solution
-## ... Procrustes: rmse 0.03868237  max resid 0.1296728 
-## Run 11 stress 0.05094992 
-## Run 12 stress 0.04719303 
-## Run 13 stress 0.05012352 
-## Run 14 stress 0.04750204 
-## Run 15 stress 0.0479423 
-## Run 16 stress 0.04579561 
-## ... Procrustes: rmse 0.004692476  max resid 0.01495666 
-## Run 17 stress 0.05069634 
-## Run 18 stress 0.0485804 
-## Run 19 stress 0.05058189 
-## Run 20 stress 0.04859459 
-## Run 21 stress 0.04996713 
-## Run 22 stress 0.04740079 
-## Run 23 stress 0.04747632 
-## Run 24 stress 0.04675455 
-## Run 25 stress 0.04747574 
-## Run 26 stress 0.0486171 
-## Run 27 stress 0.04575823 
-## ... New best solution
-## ... Procrustes: rmse 0.0005374711  max resid 0.0008831403 
-## ... Similar to previous best
-## *** Solution reached
-

Extract x-y-z values for this nmds

-
BCxyz = scores(BC.nmds.3D, display="sites")
-#This is a table that looks like 
-BCxyz
-
##                 NMDS1       NMDS2        NMDS3
-## 5017.1yr.F -4.7973931  0.33029806 -0.211481225
-## 5017.2w.F   3.1867260  0.06208276  1.484970505
-## 5017.8w.F   1.0614871 -2.13025264 -1.218243774
-## 5020.1yr.F -4.7579235  0.24440345 -0.002888360
-## 5020.2w.F   3.4979230 -1.00981047  1.015200903
-## 5020.8w.F   1.5897780 -1.93435391  0.464128291
-## 5026.1yr.F -4.7720517  0.20611823  0.214815994
-## 5026.2w.F   3.3976411  1.10010056 -0.616957559
-## 5026.8w.F   3.1483050  2.07715934  1.478767471
-## 5031.1yr.F -4.8021402  0.44250394  0.202447638
-## 5031.2w.F   3.3537430  0.48376070 -1.490408346
-## 5031.8w.F   0.8577869 -1.64300786  0.250766536
-## 5037.1yr.F -4.8522745  0.48898068 -0.004218580
-## 5037.2w.F   3.6593056  0.26886383 -0.507062657
-## 5037.8w.F   3.1326413 -0.82210579 -0.024946820
-## 5041.1yr.F -4.7724198  0.28335210  0.060469429
-## 5041.2w.F   3.1661815  2.43615798 -1.218459457
-## 5041.8w.F   1.0947996 -2.58325770 -0.236659085
-## 5045.1yr.F -4.7522029  0.16444286  0.004405471
-## 5045.2w.F   1.5110480  3.11956405 -0.469494555
-## 5045.8w.F   1.4900615 -2.17087166 -0.450930039
-## 5053.1yr.F -4.8259682  0.39929033 -0.016428020
-## 5053.2w.F   3.2932453  2.30299477  0.813801957
-## 5053.8w.F   0.8917011 -2.11641360  0.478404284
-

Plot the xyz coordinates and color by age

-
plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red"))
-
- -

Note: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so.

-
par(mfrow=c(1,2))
-#Axis 1 and 2 (x and y)
-plot(BCxyz[,1], BCxyz[,2], main="Bray-Curtis 1:2", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-5.4, 3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Axis 1 and 3 (x and z)
-plot(BCxyz[,1], BCxyz[,3], main="Bray-Curtis 1:3", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-

-
-
-
-

Phylogenetic-based metrics

-

The most common of these beta-diversity metrics is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant.

-

Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data.

-

Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands.

-
-

Create physeq object

-

To start, you must make a phyloseq object which includes the OTU.clean, meta, and tax.clean data. We tell R which tables are each type.

-
OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE)
-tax.UF = tax_table(as.matrix(tax.clean))
-meta.UF = sample_data(meta)
-

We then merge these into an object of class phyloseq.

-
physeq = phyloseq(OTU.UF, tax.UF, meta.UF)
-

To add the phylogenetic component to UniFrac, we calculate a rooted phylogenetic tree of our OTUs. This takes a long time so we have provided the tree for you.

-

However, if we were to calculate a tree, we would first import a distance matrix created from representative sequences of our OTUs. We would use phangorn to read the file as it was created in mothur as seen under “Trees of OTUs” here.

-

DO NOT RUN THIS

-
dist.mat = import_mothur_dist("clean_repFasta.phylip.dist")
-

We would then calculate a rooted neighbor-joining tree from the distance matrix using the ape package.

-

DO NOT RUN THIS

-
NJ.tree = bionj(dist.mat)
-

Instead, we have pre-calculated this tree and you can load it with

-
load("NJ.tree.Rdata")
-

Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations.

-
physeq.tree = merge_phyloseq(physeq, NJ.tree)
-

We can look at this object and see its components.

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
-
-

Dot plots

-

Calculate weighted UniFrac (i.e. diversity) distances and ordinate into an nMDS. We specify weighted with weighted=TRUE.

-
wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00062 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 0.0864543 
-## Run 1 stress 0.08645377 
-## ... New best solution
-## ... Procrustes: rmse 0.0001213931  max resid 0.0003141587 
-## ... Similar to previous best
-## Run 2 stress 0.1335727 
-## Run 3 stress 0.1463023 
-## Run 4 stress 0.08645329 
-## ... New best solution
-## ... Procrustes: rmse 0.0007206919  max resid 0.001920389 
-## ... Similar to previous best
-## Run 5 stress 0.1270238 
-## Run 6 stress 0.1157455 
-## Run 7 stress 0.1143571 
-## Run 8 stress 0.1317677 
-## Run 9 stress 0.08645345 
-## ... Procrustes: rmse 5.804039e-05  max resid 0.0001620988 
-## ... Similar to previous best
-## Run 10 stress 0.08808605 
-## Run 11 stress 0.08645348 
-## ... Procrustes: rmse 0.000642139  max resid 0.001706552 
-## ... Similar to previous best
-## Run 12 stress 0.1157451 
-## Run 13 stress 0.0864534 
-## ... Procrustes: rmse 4.051435e-05  max resid 0.0001125382 
-## ... Similar to previous best
-## Run 14 stress 0.1143564 
-## Run 15 stress 0.08659435 
-## ... Procrustes: rmse 0.004251655  max resid 0.01804703 
-## Run 16 stress 0.1295296 
-## Run 17 stress 0.0864538 
-## ... Procrustes: rmse 0.000161137  max resid 0.0004585026 
-## ... Similar to previous best
-## Run 18 stress 0.1347981 
-## Run 19 stress 0.08645297 
-## ... New best solution
-## ... Procrustes: rmse 0.0003657154  max resid 0.0008934259 
-## ... Similar to previous best
-## Run 20 stress 0.08808625 
-## *** Solution reached
-

You can plot UniFrac nMDS using the basic plot function as we’ve done before.

-
par(mfrow=c(1,1))
-plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(0.3,0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

But let’s also look at the ggplot2 package. This package is incredibly powerful and can be customized in many ways. This document has many helpful tips.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Weighted UniFrac")
-

-

Unweighted UniFrac (i.e. richness) can be visualized in the same way. We specify unweighted with weighted=FALSE.

-
uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00541 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 9.695153e-05 
-## Run 1 stress 9.657832e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.750783e-05  max resid 0.0002776914 
-## ... Similar to previous best
-## Run 2 stress 9.871795e-05 
-## ... Procrustes: rmse 8.086551e-05  max resid 0.0002819207 
-## ... Similar to previous best
-## Run 3 stress 9.488623e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.261501e-05  max resid 0.0002642816 
-## ... Similar to previous best
-## Run 4 stress 9.862006e-05 
-## ... Procrustes: rmse 1.701217e-05  max resid 5.025527e-05 
-## ... Similar to previous best
-## Run 5 stress 9.806631e-05 
-## ... Procrustes: rmse 0.0001070473  max resid 0.0002353732 
-## ... Similar to previous best
-## Run 6 stress 9.757454e-05 
-## ... Procrustes: rmse 3.985665e-05  max resid 0.0001388531 
-## ... Similar to previous best
-## Run 7 stress 9.826177e-05 
-## ... Procrustes: rmse 9.722135e-05  max resid 0.0002191936 
-## ... Similar to previous best
-## Run 8 stress 9.695708e-05 
-## ... Procrustes: rmse 7.448687e-05  max resid 0.0002751687 
-## ... Similar to previous best
-## Run 9 stress 9.907648e-05 
-## ... Procrustes: rmse 9.310993e-05  max resid 0.0002388289 
-## ... Similar to previous best
-## Run 10 stress 9.984534e-05 
-## ... Procrustes: rmse 3.384419e-05  max resid 0.0001260377 
-## ... Similar to previous best
-## Run 11 stress 9.684607e-05 
-## ... Procrustes: rmse 0.0001319037  max resid 0.0003356478 
-## ... Similar to previous best
-## Run 12 stress 9.69891e-05 
-## ... Procrustes: rmse 8.404145e-06  max resid 2.447679e-05 
-## ... Similar to previous best
-## Run 13 stress 0.0002969569 
-## ... Procrustes: rmse 0.0003866364  max resid 0.0006715474 
-## ... Similar to previous best
-## Run 14 stress 9.723199e-05 
-## ... Procrustes: rmse 3.731826e-05  max resid 0.0001336343 
-## ... Similar to previous best
-## Run 15 stress 9.99257e-05 
-## ... Procrustes: rmse 0.0001270356  max resid 0.0003614341 
-## ... Similar to previous best
-## Run 16 stress 9.955355e-05 
-## ... Procrustes: rmse 6.056256e-05  max resid 0.0001673759 
-## ... Similar to previous best
-## Run 17 stress 9.589429e-05 
-## ... Procrustes: rmse 1.686683e-05  max resid 4.596185e-05 
-## ... Similar to previous best
-## Run 18 stress 9.633493e-05 
-## ... Procrustes: rmse 3.660483e-05  max resid 0.0001324208 
-## ... Similar to previous best
-## Run 19 stress 9.921893e-05 
-## ... Procrustes: rmse 1.085938e-05  max resid 1.669484e-05 
-## ... Similar to previous best
-## Run 20 stress 9.637055e-05 
-## ... Procrustes: rmse 6.450683e-05  max resid 0.0001970587 
-## ... Similar to previous best
-## *** Solution reached
-
## Warning in metaMDS(ps.dist): Stress is (nearly) zero - you may have
-## insufficient data
-
plot_ordination(physeq.tree, uwUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Unweighted UniFrac")
-

-
-
-

Ellipses

-

Ellipses can be plotted instead of points as well. With the basic plot function:

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
legend(0.3, 0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We can also plot ellipses in ggplot2. However, these ellipses are not exactly the same as the standard error ellipses used with OTU-based metrics, as they use different underlying calculations. Still, they get at the same question of confidence intervals for groups of points on an nMDS.

-

We plot ellipses with ggplot2 by adding the stat_ellipse function to our plot.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  stat_ellipse() + 
-  ggtitle("Weighted UniFrac")
-

-
-
-

3D plots

-

3D UniFrac ordinations are not currently supported by phyloseq. We see that our ordinations only include 2 dimensions.

-
wUF.ordu
-
## 
-## Call:
-## metaMDS(comm = ps.dist) 
-## 
-## global Multidimensional Scaling using monoMDS
-## 
-## Data:     ps.dist 
-## Distance: user supplied 
-## 
-## Dimensions: 2 
-## Stress:     0.08645297 
-## Stress type 1, weak ties
-## Two convergent solutions found after 20 tries
-## Scaling: centring, PC rotation 
-## Species: scores missing
-
uwUF.ordu
-
## 
-## Call:
-## metaMDS(comm = ps.dist) 
-## 
-## global Multidimensional Scaling using monoMDS
-## 
-## Data:     ps.dist 
-## Distance: user supplied 
-## 
-## Dimensions: 2 
-## Stress:     9.488623e-05 
-## Stress type 1, weak ties
-## Two convergent solutions found after 20 tries
-## Scaling: centring, PC rotation 
-## Species: scores missing
-
-
-
-

Vectors for continuous variables

-

While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots.

-

To do this, we first fit the variables to our distances using the envfit function in vegan. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac.

-
fit.BC = envfit(BC.nmds, meta) 
-fit.BC
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## AgeExact -0.99887 -0.04744 0.9765  0.001 ***
-## ADGKG     0.12503  0.99215 0.0770  0.436    
-## chao     -0.98567  0.16868 0.9599  0.001 ***
-## shannon  -0.69400  0.71997 0.9469  0.001 ***
-## simpson   0.42087 -0.90712 0.7353  0.001 ***
-## ace      -0.99746  0.07129 0.9078  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##                   NMDS1   NMDS2
-## Animalcow5017   -0.1841  0.5449
-## Animalcow5020    0.0059  0.6577
-## Animalcow5026    0.4243 -0.8826
-## Animalcow5031   -0.2442  0.1175
-## Animalcow5037    0.4946 -0.0566
-## Animalcow5041    0.0500 -0.0290
-## Animalcow5045   -0.1374 -0.3384
-## Animalcow5053   -0.4090 -0.0134
-## AgeGroup1yr     -4.4470 -0.1800
-## AgeGroup2w       2.5047 -1.0509
-## AgeGroup8w       1.9422  1.2309
-## AgeGroup.ord2w   2.5047 -1.0509
-## AgeGroup.ord8w   1.9422  1.2309
-## AgeGroup.ord1yr -4.4470 -0.1800
-## 
-## Goodness of fit:
-##                  r2 Pr(>r)    
-## Animal       0.0248  1.000    
-## AgeGroup     0.9134  0.001 ***
-## AgeGroup.ord 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We see that it has automatically fit every variable in our meta table.

-

The simplest way around this is to just ask envfit to run on only the variables you want.

-
fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")])
-fit.BC
-
## 
-## ***VECTORS
-## 
-##         NMDS1   NMDS2    r2 Pr(>r)
-## ADGKG 0.12503 0.99215 0.077  0.488
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -4.4470 -0.1800
-## AgeGroup2w   2.5047 -1.0509
-## AgeGroup8w   1.9422  1.2309
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We repeat for weighted UniFrac

-
fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")])
-fit.wUF
-
## 
-## ***VECTORS
-## 
-##          NMDS1    NMDS2     r2 Pr(>r)
-## ADGKG -0.17846  0.98395 0.0398  0.651
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -0.1076 -0.0834
-## AgeGroup2w   0.1432  0.0322
-## AgeGroup8w  -0.0356  0.0511
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.5588  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group’s name. For continuous variables, it adds an arrow in the direction from smallest to largest value.

-

Note: The P-values for variables in envfit are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, envfit P-values tell you how well the arrow or centroids fit the x-y data of the nMDS, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was not significant, the fitted arrow/centroids will also not be significant. We see this type of result here, but this will not always be the case.

-

However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it’s difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in envfit that you know are significant in other tests may still be represented on an nMDS as a visual aid.

-

Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won’t use this plot in a publication, but it is good practice.

-

For Bray-Curtis:

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black")
-

-

You could also ask it to only plot variables with a fit P-value < 0.05. So we would only see the centroids.

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black", p.max=0.05)
-

-

Weighted UniFrac

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(.3,.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.wUF, col="black")
-

-

You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group.

-

Fitting all OTUs would take a while, so we will only fit the first 10 in our table.

-
fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10])
-fit.BC.OTU
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## Otu00001  0.71738 -0.69668 0.2478  0.035 *  
-## Otu00002  0.46984 -0.88275 0.2109  0.083 .  
-## Otu00003  0.25719 -0.96636 0.2503  0.028 *  
-## Otu00004  0.25006  0.96823 0.2738  0.025 *  
-## Otu00005  0.15473  0.98796 0.2910  0.011 *  
-## Otu00006 -0.96867  0.24837 0.6743  0.001 ***
-## Otu00007  0.17991 -0.98368 0.2488  0.011 *  
-## Otu00008  0.40157  0.91583 0.3108  0.022 *  
-## Otu00009  0.26275 -0.96487 0.1894  0.060 .  
-## Otu00010  0.33868 -0.94090 0.1552  0.102    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#We will only plot significant arrows in this case
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.OTU, col="black", p.max=0.05)
-

-

You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs.

-
#Extract all OTUs within the genus Ruminococcus
-OTU.Rumino = OTU.clean[,tax.clean$Genus == "g__Ruminococcus"]
-#Sum the abundances of the Ruminococcus OTUs into one variable (column)
-OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino)
-
-#Fit the new Ruminococcus group
-fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum)
-fit.BC.Rumino
-
## 
-## ***VECTORS
-## 
-##         NMDS1    NMDS2     r2 Pr(>r)    
-## [1,] -0.14506  0.98942 0.6621  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#Plot
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.Rumino, col="black", labels=c("Ruminococcus"))
-

-
-
-
-

Statistically test beta-diversity

-

While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest.

-

You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are significant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. Like group 1 has a lot of Prevotella OTU1 and group 2 has a lot of Prevotella OTU2, but they are both Prevotella so UniFrac treats them as being very similar.

-
-

PERMANOVA

-

For Bray-Curtis or Jaccard, we use the vegan package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model.

-

Note: adonis cannot handle or account for NA or blanks in your data. Subset to only samples with complete metadata before running vegdist if these exist.

-
#Calculate distance and save as a matrix
-BC.dist=vegdist(OTU.clean, distance="bray")
-#Run PERMANOVA on distances.
-adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000, method = "bray")
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000,      method = "bray") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.646354    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.922078    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Similarly for Jaccard

-
J.dist=vegdist(OTU.clean, distance="jaccard")
-adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000, method = "jaccard")
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000,      method = "jaccard") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.590410    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.924076    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We see that the interaction is not significant so we remove it.

-
adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000, method = "bray")
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000,      method = "bray") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.599401    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000, method = "jaccard")
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000,      method = "jaccard") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.559441    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

For UniFrac, we use the phyloseq package to calculate distances and then vegan to run PERMANOVA.

-
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
-## Randomly assigning root as -- Otu00949 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2   1.03525 0.51763  5.8901 0.36735 0.000999 ***
-## ADGKG           1   0.09908 0.09908  1.1275 0.03516 0.321678    
-## AgeGroup:ADGKG  2   0.10195 0.05098  0.5801 0.03618 0.873127    
-## Residuals      18   1.58185 0.08788         0.56131             
-## Total          23   2.81814                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = FALSE, normalized = TRUE):
-## Randomly assigning root as -- Otu04503 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.4929 1.74647  9.1892 0.47107 0.000999 ***
-## ADGKG           1    0.2360 0.23596  1.2415 0.03182 0.230769    
-## AgeGroup:ADGKG  2    0.2650 0.13248  0.6971 0.03573 0.813187    
-## Residuals      18    3.4210 0.19006         0.46137             
-## Total          23    7.4149                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Remove non-significant interaction term

-
adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2   1.03525 0.51763  6.1483 0.36735 0.000999 ***
-## ADGKG      1   0.09908 0.09908  1.1769 0.03516 0.306693    
-## Residuals 20   1.68380 0.08419         0.59749             
-## Total     23   2.81814                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.4929 1.74647  9.4762 0.47107 0.000999 ***
-## ADGKG      1    0.2360 0.23596  1.2803 0.03182 0.217782    
-## Residuals 20    3.6860 0.18430         0.49711             
-## Total     23    7.4149                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
-
-

ANOSIM

-

If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if your group sizes are very, very different, like 10 vs 100.

-

For example, Bray-Curtis:

-
anosim(BC.dist, meta$AgeGroup, permutations = 1000)
-
## 
-## Call:
-## anosim(dat = BC.dist, grouping = meta$AgeGroup, permutations = 1000) 
-## Dissimilarity: bray 
-## 
-## ANOSIM statistic R: 0.8467 
-##       Significance: 0.000999 
-## 
-## Permutation: free
-## Number of permutations: 1000
-

Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.

-
-
-

2D variables

-

These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi.

-

Mantel from vegan tests if two distance matrices co-vary e.g. does the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter.

-

You can only compare samples where you have both types of data, so we must subset our OTU table to only the samples that we also have SCFA for. The names are a little different between the tables so we also add “.F” to the SCFA names to make them match.

-
OTU.SCFA = OTU.clean[row.names(OTU.clean) %in% paste(row.names(SCFA), ".F", sep=""),]
-

We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it.

-
dist1 = vegdist(OTU.SCFA)
-dist2 = vegdist(SCFA)
-

Run a Mantel test comparing the 2 matrices.

-
mantel(dist1, dist2, permutations=100)
-
## 'nperm' >= set of all permutations: complete enumeration.
-
## Set of permutations < 'minperm'. Generating entire set.
-
## 
-## Mantel statistic based on Pearson's product-moment correlation 
-## 
-## Call:
-## mantel(xdis = dist1, ydis = dist2, permutations = 100) 
-## 
-## Mantel statistic r: -0.02423 
-##       Significance: 0.54167 
-## 
-## Upper quantiles of permutations (null model):
-##   90%   95% 97.5%   99% 
-## 0.540 0.552 0.596 0.629 
-## Permutation: free
-## Number of permutations: 23
-

We see that the overall OTU table and SCFA tables do not co-vary.

-

You can also run Mantel on 3 matrices at once like so

-

Do not run as we do not have 3 matrices here

-
mantel.partial(dist1, dist2, dist3, permutations=100)
-
-
-
-

Beta dispersion

-

Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances).

-

Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM

-

Calculate dispersion (variances) within each group.

-
disp.age = betadisper(BC.dist, meta$AgeGroup)
-

Perform an ANOVA-like test to determine if the variances differ by groups.

-
permutest(disp.age, pairwise=TRUE, permutations=1000)
-
## 
-## Permutation test for homogeneity of multivariate dispersions
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Response: Distances
-##           Df  Sum Sq  Mean Sq     F N.Perm   Pr(>F)    
-## Groups     2 0.47459 0.237293 30.93   1000 0.000999 ***
-## Residuals 21 0.16111 0.007672                          
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## Pairwise comparisons:
-## (Observed p-value below diagonal, permuted p-value above diagonal)
-##            1yr         2w     8w
-## 1yr            9.9900e-04 0.0010
-## 2w  4.8556e-06            0.7622
-## 8w  1.2886e-06 7.7206e-01
-

Combining this with our plot,

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers.

-
-
-
-

OTUs that differ by

-
-

Categorical variables

-

Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don’t differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don’t want to look at over 5000 P-values, do you?

-

There are a number of ways to decrease the number of OTUs you’re looking at

-
    -
  1. Don’t use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest
  2. -
  3. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample
  4. -
  5. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples
  6. -
  7. Combine 2 and 3
  8. -
-

However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low abundant OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren’t useful?

-

So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. Note: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables.

-

SIMPER’s output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are cumulative, so to get each OTU’s contribution, you must subtract the previous OTU’s value.

-

For example

-
simper(OTU.clean, meta$AgeGroup, permutations=100)
-
## cumulative contributions of most influential species:
-## 
-## $`1yr_2w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00011  Otu00006  Otu00009 
-## 0.0983761 0.1627191 0.2225335 0.2657879 0.2982889 0.3271508 0.3514210 
-##  Otu00014  Otu00022  Otu00018  Otu00012  Otu00016  Otu00004  Otu00021 
-## 0.3660756 0.3793171 0.3924608 0.4048922 0.4171422 0.4283988 0.4385280 
-##  Otu00008  Otu00025  Otu00028  Otu00023  Otu00037  Otu00013  Otu00035 
-## 0.4479076 0.4565849 0.4646081 0.4723795 0.4790690 0.4857141 0.4920793 
-##  Otu00055  Otu00030  Otu00036  Otu00040  Otu00042  Otu00010  Otu00049 
-## 0.4983615 0.5045449 0.5106265 0.5166717 0.5226378 0.5274331 0.5321886 
-##  Otu00046  Otu00033  Otu00031  Otu00081  Otu00051  Otu00064  Otu00056 
-## 0.5368030 0.5413764 0.5458188 0.5500936 0.5543565 0.5582465 0.5620674 
-##  Otu00032  Otu00052  Otu00062  Otu00026  Otu00020  Otu00074  Otu00069 
-## 0.5657989 0.5695078 0.5730822 0.5765920 0.5799406 0.5831741 0.5864067 
-##  Otu00066  Otu00077  Otu00148  Otu00073  Otu00067  Otu00065  Otu00076 
-## 0.5895953 0.5927428 0.5958511 0.5989588 0.6020549 0.6051241 0.6081334 
-##  Otu00075  Otu00091  Otu00048  Otu00097  Otu00068  Otu00050  Otu00084 
-## 0.6111073 0.6140400 0.6169121 0.6196512 0.6223697 0.6250661 0.6277023 
-##  Otu00100  Otu00019  Otu00063  Otu00039  Otu00086  Otu00071  Otu00101 
-## 0.6303356 0.6329664 0.6355752 0.6381709 0.6406744 0.6431362 0.6455850 
-##  Otu00089  Otu00096  Otu00095  Otu00108  Otu00088  Otu00103  Otu00094 
-## 0.6480310 0.6504700 0.6528884 0.6553007 0.6576757 0.6600472 0.6624184 
-##  Otu00098  Otu00116  Otu00090  Otu00105  Otu00104  Otu00099  Otu00059 
-## 0.6647575 0.6670589 0.6693444 0.6716046 0.6738590 0.6760506 0.6781917 
-##  Otu00106  Otu00115  Otu00102  Otu00110  Otu00119  Otu00118  Otu00034 
-## 0.6803196 0.6824245 0.6844633 0.6865021 0.6884972 0.6904775 0.6924261 
-##  Otu00114  Otu00093  Otu00124  Otu00045 
-## 0.6943714 0.6962690 0.6981558 0.7000319 
-## 
-## $`1yr_8w`
-##   Otu00001   Otu00005   Otu00006   Otu00004   Otu00010   Otu00017 
-## 0.03765603 0.07335078 0.10010930 0.12226268 0.14087762 0.15688502 
-##   Otu00008   Otu00009   Otu00015   Otu00018   Otu00016   Otu00014 
-## 0.17205091 0.18718833 0.20107546 0.21456235 0.22713556 0.23964967 
-##   Otu00029   Otu00019   Otu00021   Otu00025   Otu00024   Otu00037 
-## 0.25102468 0.26162658 0.27202671 0.28093293 0.28829315 0.29516652 
-##   Otu00035   Otu00044   Otu00055   Otu00027   Otu00036   Otu00040 
-## 0.30170335 0.30821052 0.31465848 0.32109529 0.32733731 0.33354206 
-##   Otu00042   Otu00020   Otu00013   Otu00041   Otu00003   Otu00043 
-## 0.33966556 0.34564370 0.35158279 0.35717451 0.36261926 0.36799345 
-##   Otu00038   Otu00026   Otu00034   Otu00049   Otu00070   Otu00046 
-## 0.37334038 0.37836130 0.38334135 0.38822230 0.39310161 0.39783775 
-##   Otu00012   Otu00058   Otu00011   Otu00051   Otu00054   Otu00045 
-## 0.40234701 0.40670755 0.41102172 0.41521298 0.41939306 0.42353985 
-##   Otu00047   Otu00064   Otu00056   Otu00052   Otu00048   Otu00002 
-## 0.42764688 0.43163954 0.43556497 0.43937178 0.44313291 0.44683135 
-##   Otu00062   Otu00031   Otu00057   Otu00061   Otu00053   Otu00074 
-## 0.45050368 0.45405112 0.45759807 0.46109474 0.46455875 0.46787762 
-##   Otu00069   Otu00066   Otu00077   Otu00073   Otu00067   Otu00079 
-## 0.47119548 0.47447192 0.47770248 0.48089214 0.48406988 0.48721802 
-##   Otu00083   Otu00078   Otu00076   Otu00075   Otu00091   Otu00121 
-## 0.49033806 0.49342871 0.49651735 0.49956976 0.50257978 0.50549547 
-##   Otu00097   Otu00092   Otu00032   Otu00084   Otu00129   Otu00050 
-## 0.50830678 0.51111612 0.51389884 0.51660098 0.51922111 0.52181856 
-##   Otu00100   Otu00101   Otu00096   Otu00108   Otu00095   Otu00086 
-## 0.52434751 0.52686095 0.52936793 0.53184756 0.53429667 0.53674109 
-##   Otu00089   Otu00088   Otu00103   Otu00094   Otu00098   Otu00116 
-## 0.53918547 0.54162316 0.54405719 0.54649097 0.54889172 0.55125394 
-##   Otu00105   Otu00104   Otu00143   Otu00123   Otu00082   Otu00039 
-## 0.55357747 0.55589135 0.55819397 0.56049152 0.56278380 0.56503978 
-##   Otu00099   Otu00130   Otu00090   Otu00106   Otu00107   Otu00115 
-## 0.56728918 0.56953083 0.57176616 0.57395024 0.57611979 0.57828018 
-##   Otu00087   Otu00153   Otu00102   Otu00110   Otu00119   Otu00118 
-## 0.58042631 0.58252590 0.58461849 0.58671108 0.58875879 0.59079874 
-##   Otu00022   Otu00072   Otu00080   Otu00093   Otu00124   Otu00112 
-## 0.59281824 0.59481609 0.59678509 0.59873275 0.60067308 0.60260107 
-##   Otu00122   Otu00131   Otu00132   Otu00134   Otu00128   Otu00125 
-## 0.60450552 0.60639869 0.60828362 0.61014314 0.61199594 0.61383412 
-##   Otu00133   Otu00159   Otu00139   Otu00127   Otu00114   Otu00137 
-## 0.61566158 0.61747930 0.61928689 0.62106367 0.62282385 0.62455846 
-##   Otu00136   Otu00194   Otu00138   Otu00144   Otu00142   Otu00135 
-## 0.62629042 0.62801571 0.62974033 0.63143945 0.63312281 0.63480281 
-##   Otu00147   Otu00120   Otu00188   Otu00126   Otu00028   Otu00211 
-## 0.63647550 0.63814069 0.63980299 0.64140642 0.64300322 0.64457174 
-##   Otu00154   Otu00146   Otu00173   Otu00156   Otu00158   Otu00157 
-## 0.64612078 0.64764950 0.64917769 0.65068721 0.65217234 0.65364696 
-##   Otu00060   Otu00168   Otu00140   Otu00163   Otu00171   Otu00113 
-## 0.65508066 0.65651008 0.65793253 0.65931862 0.66069801 0.66207484 
-##   Otu00178   Otu00200   Otu00165   Otu00170   Otu00164   Otu00187 
-## 0.66344999 0.66480785 0.66616041 0.66748648 0.66881018 0.67012189 
-##   Otu00151   Otu00213   Otu00149   Otu00183   Otu00192   Otu00167 
-## 0.67141176 0.67269928 0.67397558 0.67525135 0.67652371 0.67778788 
-##   Otu00177   Otu00181   Otu00180   Otu00236   Otu00186   Otu00199 
-## 0.67904574 0.68029263 0.68151160 0.68272731 0.68393783 0.68512983 
-##   Otu00253   Otu00150   Otu00204   Otu00169   Otu00218   Otu00189 
-## 0.68632029 0.68750539 0.68867418 0.68982822 0.69097221 0.69210846 
-##   Otu00182   Otu00184   Otu00226   Otu00270   Otu00172   Otu00225 
-## 0.69323878 0.69436709 0.69548866 0.69660494 0.69770318 0.69878699 
-##   Otu00185   Otu00203 
-## 0.69986670 0.70093653 
-## 
-## $`2w_8w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00009  Otu00005  Otu00011 
-## 0.1101390 0.1804133 0.2466786 0.2952479 0.3351854 0.3745198 0.4100899 
-##  Otu00004  Otu00010  Otu00017  Otu00008  Otu00012  Otu00015  Otu00022 
-## 0.4397781 0.4641945 0.4818672 0.4987872 0.5154942 0.5307997 0.5454777 
-##  Otu00029  Otu00013  Otu00019  Otu00020  Otu00028  Otu00006  Otu00023 
-## 0.5580145 0.5704325 0.5824230 0.5910912 0.5996473 0.6081657 0.6166261 
-##  Otu00024  Otu00027  Otu00031  Otu00044  Otu00030  Otu00041  Otu00043 
-## 0.6247348 0.6322130 0.6396626 0.6468237 0.6539027 0.6600291 0.6659522 
-##  Otu00038  Otu00032  Otu00026  Otu00070  Otu00033  Otu00034  Otu00047 
-## 0.6718453 0.6776585 0.6834157 0.6887933 0.6940870 0.6992933 0.7044391
-

We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups. They are not necessarily significantly different.

-

To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age.

-
kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00001 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 15.994, df = 2, p-value = 0.0003364
-

In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group

-
kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00017 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 4.9767, df = 2, p-value = 0.08305
-

Note: These P-values have not been corrected for false discovery rate (fdr) yet.

-

Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his GitHub page and we have provided them for this workshop in /Steinberger_scripts

-

Disclaimer: Andrew has provided these scripts out of the goodness of his heart and provides no guarantee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.

-

The use of these scripts is as follows (from Steinberger GitHub with some modifications)

-

simper_pretty.R

-

This script is meant to rapidly perform the SIMPER function from the R package vegan for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. Requires installation of the R package ‘vegan’.

-

Usage:

-

simper.pretty(x, metrics, c(‘interesting’), perc_cutoff=0.5, low_cutoff = ‘y’, low_val=0.01, ‘output_name’)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output.
  • -
  • low_cutoff: ‘y’ if want to REMOVE OTUs that contribute less than 1%
  • -
  • low_val: set value of low cutoff (0.01), ignored if low_cutoff=‘n’.
  • -
  • output_name: the name that is appended to the output filename “_clean_simper.csv”.
  • -
-

R_krusk.R

-

This script takes the output .csv of simper_pretty.R, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages ‘vegan’ and ‘dplyr’.

-

Usage:

-

kruskal.pretty(x, metrics, csv, c(‘interesting’), ‘output_name’, taxonomy)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv(“PATH to name_clean_simper.csv”))
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • output_name: the name that is appended to the output filename “_krusk_simper.csv”.
  • -
  • taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional)
  • -
-

First, we load these functions into R.

-
source("Steinberger_scripts/simper_pretty.r")
-source("Steinberger_scripts/R_krusk.r")
-

Then, we apply them to our data. We will ask for all SIMPER OTUs (perc_cutoff = 1, meaning up to cumulative 100%) but cut off any OTUs that individually contribute less than 1% to SIMPER (low_val=0.01). You may want to consider different cutoffs for your data.

-
simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age')
-
-simper.results = data.frame(read.csv("Age_clean_simper.csv"))
-kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax)
-

If we import the Kruskal-Wallis back into R and select only OTUs that were significantly different after fdr correction (fdr_krusk_p.val)…

-
#Import
-KW.results = data.frame(read.csv("Age_krusk_simper.csv"))
-#Remove non-significant
-KW.results.signif = KW.results[KW.results$fdr_krusk_p.val < 0.05,]
-#Order by OTU#
-KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),]
-head(KW.results.signif)
-
##     X Comparison     SIMPER      OTU  krusk_p.val fdr_krusk_p.val
-## 2   2     1yr_2w 0.06434298 Otu00001 0.0004510953     0.001383359
-## 15 15     1yr_8w 0.03765603 Otu00001 0.0004510953     0.001383359
-## 1   1     1yr_2w 0.09837610 Otu00002 0.0004510953     0.001383359
-## 30 30      2w_8w 0.11013903 Otu00002 0.0208625823     0.029989962
-## 3   3     1yr_2w 0.05981442 Otu00003 0.0003310658     0.001383359
-## 32 32      2w_8w 0.06626526 Otu00003 0.0356919001     0.044373714
-##                                                                                                                   Taxonomy
-## 2          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 15         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 1          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 30         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 3  k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-## 32 k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-##    Left.mean.abund   Left.stdev Right.mean.abund Right.stdev
-## 2     7.109140e-06 2.010768e-05      0.128370197  0.16351829
-## 15    7.109140e-06 2.010768e-05      0.073292635  0.09803742
-## 1     7.118451e-06 2.013402e-05      0.196185324  0.23796423
-## 30    1.961853e-01 2.379642e-01      0.007205221  0.01601067
-## 3     0.000000e+00 0.000000e+00      0.119333403  0.18000346
-## 32    1.193334e-01 1.800035e-01      0.010598818  0.02126522
-

we see a number of OTUs that significantly differ by age group.

-

Looking at OTU1 as relative abundance

-
#Calculate abundance
-abund = OTU.clean/rowSums(OTU.clean)*100
-#plot
-boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1")
-

-

and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves.

-
-
-

Continuous variables

-

For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models glm of the OTU abundances with different distributions family= similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model.

-
-
-

Correlations

-

So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the “strong” category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available.

-

Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables?

-

Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set.

-
#Remember we calculated abundance before with
-#abund = OTU.clean/rowSums(OTU.clean)*100
-
-#Subset OTUs to abundance cutoff
-OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))]
-
-cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall")
-cor.kendall
-
##                  [,1]
-## Otu00001  0.189852125
-## Otu00002  0.211764129
-## Otu00003  0.027397313
-## Otu00004  0.275867615
-## Otu00005  0.165056323
-## Otu00006 -0.114462240
-## Otu00007  0.143930930
-## Otu00008  0.211764129
-## Otu00009 -0.177517901
-## Otu00010  0.176299258
-## Otu00011  0.208334326
-## Otu00012  0.017236256
-## Otu00013  0.269669049
-## Otu00015  0.018077538
-## Otu00016 -0.257293680
-## Otu00017  0.284293111
-## Otu00019  0.172479145
-## Otu00020  0.102188122
-## Otu00022 -0.034040152
-## Otu00023  0.004106646
-## Otu00024  0.073416202
-## Otu00027  0.412640807
-## Otu00029  0.076924424
-## Otu00030 -0.077670805
-## Otu00031  0.286002668
-## Otu00038 -0.271163072
-## Otu00041  0.125193349
-## Otu00043  0.189645652
-## Otu00044  0.239065695
-## Otu00053 -0.217652255
-## Otu00055 -0.112428004
-## Otu00070 -0.037317590
-

In this case, we don’t see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm.

-

Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data.

-
#Calculate abundances
-abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100
-
-#Subset OTUs to abundance cutoff
-OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))]
-
-cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall")
-cor.kendall
-
##             Formate    Acetate Propionate Isobutyrate   Butyrate
-## Otu00006  0.0000000  0.1825742  0.1825742   0.1825742  0.1825742
-## Otu00014  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00016 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00018 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00021 -0.9128709 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00025  0.9128709  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00035 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00036 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00037 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00040 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00042  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00046 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00049 -0.1825742 -0.3333333 -0.3333333   0.0000000 -0.3333333
-## Otu00051  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00052 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00056 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00064 -0.5477226 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00066 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00067  0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00069  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00074  0.5477226  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00077  0.1825742  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00088  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00089  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00097 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00100 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00113 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00192  0.5477226  0.6666667  0.6666667   1.0000000  0.6666667
-## Otu00295  0.2581989  0.2357023  0.2357023   0.7071068  0.2357023
-##            iVal.2MB   Valerate
-## Otu00006 -0.1825742  0.1825742
-## Otu00014 -0.3333333  0.0000000
-## Otu00016 -0.3333333 -0.6666667
-## Otu00018 -0.3333333 -0.6666667
-## Otu00021 -0.6666667 -0.3333333
-## Otu00025  0.6666667  0.3333333
-## Otu00035 -0.6666667 -1.0000000
-## Otu00036  0.0000000 -0.3333333
-## Otu00037  0.0000000  0.3333333
-## Otu00040 -0.6666667 -1.0000000
-## Otu00042 -0.3333333  0.0000000
-## Otu00046 -0.3333333 -0.6666667
-## Otu00049  0.3333333  0.0000000
-## Otu00051  1.0000000  0.6666667
-## Otu00052 -0.6666667 -1.0000000
-## Otu00056 -0.3333333 -0.6666667
-## Otu00064 -1.0000000 -0.6666667
-## Otu00066 -0.6666667 -1.0000000
-## Otu00067  0.6666667  0.3333333
-## Otu00069  1.0000000  0.6666667
-## Otu00074  0.0000000  0.3333333
-## Otu00077  0.3333333  0.6666667
-## Otu00088  0.0000000 -0.3333333
-## Otu00089  0.0000000 -0.3333333
-## Otu00097  0.0000000  0.3333333
-## Otu00100  0.0000000  0.3333333
-## Otu00113  0.0000000 -0.3333333
-## Otu00192  0.6666667  1.0000000
-## Otu00295  0.7071068  0.7071068
-

If the data table is too large to view in R, you can write it to a table in your project folder.

-
write.table(cor.kendall, file = "cor_kendall.csv", sep = ",")
-

We see that some OTUs strongly correlate with SCFAs. For example, Otu00021 and Otu00025 with Formate

-
par(mfrow = c(1, 2))
-plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21")
-plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25")
-

-

Clearly we don’t have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit.

-
OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate)
-summary(OTU21.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00021 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##       1        2        3        4  
-## -56.173   96.253  -46.747    6.668  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    357.75      51.46   6.952   0.0201 *
-## SCFA$Formate  -540.02     201.13  -2.685   0.1152  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 7324.907)
-## 
-##     Null deviance: 67454  on 3  degrees of freedom
-## Residual deviance: 14650  on 2  degrees of freedom
-## AIC: 50.175
-## 
-## Number of Fisher Scoring iterations: 2
-
OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate)
-summary(OTU25.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00025 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##        1         2         3         4  
-##  127.727  -118.783     6.217   -15.162  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    219.78      74.49   2.951   0.0982 .
-## SCFA$Formate   721.00     291.12   2.477   0.1316  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 15346.04)
-## 
-##     Null deviance: 124819  on 3  degrees of freedom
-## Residual deviance:  30692  on 2  degrees of freedom
-## AIC: 53.133
-## 
-## Number of Fisher Scoring iterations: 2
-

So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occurring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing.

-
-
-
-

Other visualizations

-
-

Bar charts

-

The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the phyloseq / ggplot2 packages.

-

Let’s explore some of the bar chart options. First, we’ll make the classic additive bar chart for phyla in our samples

-
plot_bar(physeq.tree, fill="Phylum")
-

-

We can simplify by grouping our samples by age group

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") 
-

-

And removing the lines between OTUs in the bars

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And only showing the top 5 most abundant phyla

-
#Sort the Phyla by abundance and pick the top 5
-top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5]
-#Cut down the physeq.tree data to only the top 5 Phyla
-top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names))
-#Plot
-plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

There are many more options within ggplot2 to alter this figure. This document has many helpful tips.

-

Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid

-
plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And you can break it down at any taxonomic level and color by any other level.

-
-
-

Trees

-

We can also plot phylogenetic trees and label/modify them by our variables of interest.

-

Let’s look at the genus Prevotella in our data. We want to subset down to just this genus or else our plot would be too cluttered to read.

-

Subset by genus

-
prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella")
-

We can see that this worked by comparing the number of taxa in our subset and our original data

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
prevotella
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 106 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 106 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 106 tips and 105 internal nodes ]
-

We can plot these OTUs on a tree.

-
plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE)
-

-

In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots.

-

Let’s make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs

-
plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE)
-

-

Already it’s a little difficult to read. You can view a larger page by clicking “Zoom” above the figure. Or export the figure as a PDF and save as a full page size, 8.5x11.

-

There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options.

-
-
-

Heat maps

-

There are some good options in both phyloseq and gplots to make heatmaps. We will go through phyloseq but know that the same things could be done in gplots with code specific to that package.

-
-

OTU abundance

-

We’re going to just look at the 20 most abundant OTUs to make it more readable.

-
#Sort the OTUs by abundance and pick the top 20
-top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20])
-#Cut down the physeq.tree data to only the top 20 OTUs
-top20OTU = prune_taxa(top20OTU.names, physeq.tree)
-

We now see that we only have 20 taxa

-
top20OTU
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 20 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 20 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 20 tips and 19 internal nodes ]
-

First, you can make a heatmap of OTU abundance across all samples

-
plot_heatmap(top20OTU)
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And grouped by our age groups

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can label the OTU taxa

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And group them

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can also change the colors, including the 0s/NA which are most commonly colored the same as the lowest abundance (white here).

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="white")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We’ll show Bray-Curtis as an example. Other options are

-
    -
  • bray
  • -
  • jaccard
  • -
  • wunifrac
  • -
  • unifrac
  • -
-
plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-
-
-

Beta-diversity

-

The other common use for heatmaps is to show distances between samples (i.e. beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS.

-

We do not want to use the plot_heatmap() function from phyloseq because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a gplots command. This command will automatically group samples by similarity (trees)

-
#Bray-Curtis
-heatmap.2(as.matrix(BC.dist))
-

-
#UniFrac
-heatmap.2(as.matrix(wUF.dist))
-

-

You could also change the colors

-
#Rainbow colors
-rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9)
-heatmap.2(as.matrix(BC.dist), col=rc)
-

-

As always, for further customization, explore with ?heatmap.2

-
-
-
-

Venn Diagrams

-

Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F

-

Create a list of OTUs that occur (count > 0) in each sample.

-
    -
  • We select for the row by name with OTU.clean[“name”,]
  • -
  • We select the columns with a value >0 with OTU.clean[,apply()]
  • -
-
OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))])
-

We can then use these lists of OTUs to plot a Venn diagram with venn() from the gplots package

-
venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr))
-

-

We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr

-
OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))])
-

And plot

-
venn(list(OTU.2w, OTU.8w, OTU.1yr))
-

-

These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn. Once you have these, you can use the VennDiagram or venneuler packages for more pretty graphing options. You can also save your OTU name lists and use them in an online Venn tool

-
-
-
-

Publication figures

-

Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the “Export” function within the Plots window, but this often does not result in high enough resolution.

-

Here, we will use postscript to export at a specific resolution, size and font. This function uses

-
    -
  • width, height: in inches
  • -
  • horizontal: TRUE = landscape, FALSE = portrait
  • -
  • colormodel: RGB, CMYK, and others
  • -
  • family: Font to be used within figures
  • -
-

Then we add layout if we have more than one plot within the overall figure.

-
    -
  • matrix: -
      -
    • A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4)
    • -
    • Then the number of rows, columns the figures should be oriented in
    • -
  • -
  • widths: A list of scalars of how large each figure should be in width.
  • -
  • heights: A list of scalars of how large each figure should be in height.
  • -
-
postscript("Fig1.png", width = 6, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT")
-
-layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
-
-plot(BC.nmds, type="n", main="Bray-Curtis")
-    points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-
-boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
-
-dev.off()
-
## png 
-##   2
-
- - -
-
- - - -
-
- -
- - - - - - - - diff --git a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_final.html b/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_final.html deleted file mode 100755 index 10d0ef8..0000000 --- a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_final.html +++ /dev/null @@ -1,2629 +0,0 @@ - - - - - - - - - - - - - - - -Microbiota Analysis in R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
-
-
-
-
- -
- - - - - - - -

Updated April 15, 2017

-

Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP

-
-

Tips for this workshop

-
    -
  1. If you have any issues in R, type ??command into the console where “command” is the function you are having issues with and a help page will come up.
  2. -
  3. Lines starting with # are comments that are for the reader’s benefit. These lines are not code and do not need to be entered into the console.
  4. -
  5. GREY boxes contain code that you can copy and paste to run on your machine.
  6. -
-
#GREY box
-
    -
  1. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console.

    -
    WHITE box
  2. -
  3. Basic R code you may find useful: -
      -
    1. Matrices/data frames are designated by [ , ] where it is [rows, columns]
    2. -
    3. | is or
    4. -
    5. & is and
    6. -
  4. -
-
-
-

Introduction

-

Written for R v3.3.2 in RStudio v1.0.136

-
-

Goal

-

The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs).

-

This tutorial assumes some basic statistical knowledge. Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don’t become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data!

-
-
-

Data

-

The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in Dill-McFarland et al. Sci Rep 7: 40864. Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA).

-
-
-

Files

-

We will use the following files created using the Microbiota Processing in mothur: Standard Operating Procedure (SOP).

-
    -
  • example.final.nn.unique_list.0.03.norm.shared (OTU table)
  • -
  • example.final.nn.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs)
  • -
-

We will also be using tab-delimited metadata and SCFA files created in Excel. The metadata includes our metadata (like age and ADGKG) as well as alpha-diversity metrics from example.final.nn.unique_list.0.03.norm.groups.summary calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals.

-
    -
  • example.metadata.txt
  • -
  • example.SCFA.txt
  • -
-

Finally, we will be loading a number of custom scripts from Steinberger_scripts and a pre-calculated OTU tree NJ.tree.RData. The information for creating this tree is provided in this tutorial.

-
-
-
-

Get set up

-
-

Download and install

-
    -
  • Base R: http://cran.mtu.edu/
  • -
  • RStudio: https://www.rstudio.com/products/rstudio/download3/
  • -
  • Packages: Open RStudio on your computer. If you have not already downloaded these packages, go to the lower right quadrant of your screen and open the Package tab. Click “download” and search for the package you want to download. -
      -
    • ape
    • -
    • dplyr
    • -
    • ggplot2
    • -
    • gplots
    • -
    • lme4
    • -
    • phangorn
    • -
    • plotly
    • -
    • tidyr
    • -
    • vegan
    • -
    • VennDiagram
    • -
    • venneuler
    • -
    • phyloseq (phyloseq is not on CRAN, so we have to call it manually. See below.)
    • -
  • -
-

Copy and paste the following into your console.

-
source("https://bioconductor.org/biocLite.R")
-
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
-
biocLite("phyloseq")
-
## BioC_mirror: https://bioconductor.org
-
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.3 (2017-03-06).
-
## Installing package(s) 'phyloseq'
-
## package 'phyloseq' successfully unpacked and MD5 sums checked
-## 
-## The downloaded binary packages are in
-##  C:\Users\Kim\AppData\Local\Temp\RtmpWsR86d\downloaded_packages
-
## installation path not writeable, unable to update packages: cluster,
-##   lattice, survival
-
## Old packages: 'Biostrings', 'curl', 'IRanges', 'S4Vectors', 'XVector'
-

Note: If you are having trouble installing packages, turn off your computer’s firewall temporarily.

-
-
-

Organization

-

All of our analyses will be organized into a “Project”.

-

Make a new project by selecting File->New project. Select “New Directory” and “Empty Project”. Name the project “Microbiota_Analysis_BRC” and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop

-

Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder.

-

Now your screen should look like this

-
    -
  • Upper left: Where you type and save the code you want to run.
  • -
  • Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane.
  • -
  • Lower left: The console. Where commands and outputs run (similar to the one mothur window).
  • -
  • Lower right: Variable. Explore the different tabs.
  • -
-
-
-
-

Data manipulation

-
-

Load Packages

-

The “library” command tells R to open the package you want to use. You need to do this every time you open R.

-
#Analyses of Phylogenetics and Evolution package. Required for tree calculations to be used with phyloseq
-library(ape)
-
-#This package will also help us more easily manipulate our data
-library(dplyr)
-
## 
-## Attaching package: 'dplyr'
-
## The following objects are masked from 'package:stats':
-## 
-##     filter, lag
-
## The following objects are masked from 'package:base':
-## 
-##     intersect, setdiff, setequal, union
-
#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package.
-library(ggplot2)
-
-#This package is used to calculate and plot Venn diagrams as well as heatmaps
-library(gplots)
-
## 
-## Attaching package: 'gplots'
-
## The following object is masked from 'package:stats':
-## 
-##     lowess
-
#Linear mixed-effects models like repeated measures analysis
-library(lme4)
-
## Loading required package: Matrix
-
#used to read in mothur-formatted files
-library(phangorn)
-
-#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses.
-library(phyloseq)
-
-#A package to create interactive web graphics of use in 3D plots
-library(plotly)
-
## 
-## Attaching package: 'plotly'
-
## The following object is masked from 'package:ggplot2':
-## 
-##     last_plot
-
## The following object is masked from 'package:stats':
-## 
-##     filter
-
## The following object is masked from 'package:graphics':
-## 
-##     layout
-
#This package will help us more easily manipulate our data, which are matrices
-library(tidyr)
-
## 
-## Attaching package: 'tidyr'
-
## The following object is masked from 'package:Matrix':
-## 
-##     expand
-
#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses.
-library(vegan)
-
## Loading required package: permute
-
## Loading required package: lattice
-
## This is vegan 2.4-3
-
## 
-## Attaching package: 'vegan'
-
## The following objects are masked from 'package:phangorn':
-## 
-##     diversity, treedist
-
#Pretty Venn diagrams
-library(VennDiagram)
-
## Loading required package: grid
-
## Loading required package: futile.logger
-
## 
-## Attaching package: 'VennDiagram'
-
## The following object is masked from 'package:ape':
-## 
-##     rotate
-
library(venneuler)
-
## Loading required package: rJava
-
-
-

Load Data

-

In the code, the text before = is what the file will be called in R. Make this short but unique as this is how you will tell R to use this file in later commands.

-
    -
  • header: tells R that the first row is column names, not data
  • -
  • row.names: tells R that the first column is row names, not data
  • -
  • sep: tells R that the data are tab-delimited. If you had a comma-delimited file, you would use sep=","
  • -
-
#OTU table (shared file)
-OTU = read.table("example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t")
-
-#Taxonomy of each OTU
-tax = read.table("example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t")
-
-#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names)
-meta = read.table("example.metadata.txt", header=TRUE, row.names=1, sep="\t")
-
-#SCFA data
-SCFA = read.table("example.SCFA.txt", header=TRUE, row.names=1, sep="\t")
-
-
-

Clean up the data

-

You can look at your data by clicking on it in the upper-right quadrant “Environment”

-

There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them.

-
-

OTU table

-

We need to use the “Group” column as the row names so that it will match our metadata

-
row.names(OTU) = OTU$Group
-

We then need to remove the “label”, “numOtus”, and “Group” columns as they are not OTU counts like the rest of the table

-
OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))]
-
-
-

Taxonomy table

-

For the taxonomy table, we name the rows by the OTU #

-
row.names(tax) = tax$OTU
-

Remove all the OTUs that don’t occur in our OTU.clean data set

-
tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),]
-

We then need to separate the “taxonomy” column so that each level (i.e. Domain, Phylum, etc) is in its own column. We do this with a special command “separate” from the tidyr package

-
tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";")
-

Finally, we remove the “Size” and “Strain” columns as well as “OTU” since these are now the row names

-
tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))]
-
-
-

Metadata and SCFA tables

-

These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis.

-
-
-
-

Order the data

-

To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these.

-
OTU.clean = OTU.clean[order(row.names(OTU.clean)),]
-meta = meta[order(row.names(meta)),]
-SCFA = SCFA[order(row.names(SCFA)),]
-

Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it.

-
-
-

Set seed

-

We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed.

-
set.seed(8765)
-
-
-
-

Alpha-diversity

-

Alpha-diversity is within sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric.

-

This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2.

-
-

Explore alpha metrics

-

Now we will start to look at our data. We will first start with alpha-diversity and richness. Let’s plot some common ones here.

-
#Create 2x2 plot environment so that we can see all 4 metrics at once. 
-par(mfrow = c(2, 2))
-
-#Then plot each metric.
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis.

-

Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I’ve specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon’s diversity as well as most richness metrics. Simpson’s diversity, on the other hand, is usually skewed as seen here.

-

So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity.

-

Let’s look at inverse Simpson instead.

-
#Create 2x2 plot environment 
-par(mfrow = c(2, 2))
-
-#Plots
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

Now we see a bimodal distribution for Simpson similar to the richness metrics.

-

To test for normalcy statistically, we can run the Shapiro-Wilk test of normality.

-
shapiro.test(meta$shannon)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$shannon
-## W = 0.91511, p-value = 0.0456
-
shapiro.test(1/meta$simpson)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  1/meta$simpson
-## W = 0.74821, p-value = 4.69e-05
-
shapiro.test(meta$chao)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$chao
-## W = 0.80636, p-value = 0.0003749
-
shapiro.test(meta$ace)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$ace
-## W = 0.83017, p-value = 0.0009573
-

We see that, as expected from the graphs, none are normal.

-

However, our sample size is small and normalcy tests are very sensitive for small data-sets. In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!)

-

So, what does this mean for our purposes? Well, we should run statistical tests that don’t assume our data is normal, because we don’t have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well.

-

Overall, for alpha-diversity:

-
    -
  • ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal
  • -
  • Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal
  • -
-

Our main variables of interest are

-
    -
  • AgeGroup: 2w, 8w, 1yr
  • -
  • ADGKG: 0.05-1.56 kg gained per day (average daily gain kg)
  • -
-
-
-

Categorical variables

-

Now that we know which tests can be used, let’s run them.

-

Normally distributed metrics

-

Since it’s the closest to normalcy, we will use Shannon’s diversity as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test

-

Does age impact the Shannon diversity of the fecal microbiota?

-
#Run the ANOVA and save it as an object
-aov.shannon.age = aov(shannon ~ AgeGroup, data=meta)
-#Call for the summary of that ANOVA, which will include P-values
-summary(aov.shannon.age)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   103.4 1.35e-11 ***
-## Residuals   21   4.36   0.208                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey’s honest significance test of our ANOVA.

-
TukeyHSD(aov.shannon.age)
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup, data = meta)
-## 
-## $AgeGroup
-##             diff        lwr       upr   p adj
-## 2w-1yr -3.270063 -3.8446230 -2.695503 0.0e+00
-## 8w-1yr -1.830903 -2.4054628 -1.256342 2.0e-07
-## 8w-2w   1.439160  0.8646001  2.013720 8.5e-06
-

We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age.

-
#Re-order the groups because the default is 1yr-2w-8w
-meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr"))
-#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity")
-

-

Non-normally distributed metrics

-

We will use Chao’s richness estimate here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test)

-
kruskal.test(chao ~ AgeGroup, data=meta)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  chao by AgeGroup
-## Kruskal-Wallis chi-squared = 19.28, df = 2, p-value = 6.507e-05
-

We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests

-
pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr")
-
## 
-##  Pairwise comparisons using Wilcoxon rank sum test 
-## 
-## data:  meta$chao and meta$AgeGroup 
-## 
-##    1yr     2w     
-## 2w 0.00023 -      
-## 8w 0.00023 0.00186
-## 
-## P value adjustment method: fdr
-

Like diversity, we see that richness also increases with age.

-
#Create 1x1 plot environment
-par(mfrow = c(1, 1))
-#Plot
-boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness")
-

-
-
-

Continuous variables

-

For continuous variables, we use general linear models, specifying the distribution that best fits our data.

-

Normally distributed metrics

-

Since ADG is a continuous variable, we run a general linear model. We will again use Shannon’s diversity as our roughly normal metric. The default of glm and lm is the normal distribution so we don’t have to specify anything.

-

Does ADG impact the Shannon diversity of the fecal microbiota?

-
glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta)
-summary(glm.shannon.ADG)
-
## 
-## Call:
-## glm(formula = shannon ~ ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -2.49110  -1.11216  -0.01749   1.53658   1.84728  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)   
-## (Intercept)  3.62565    1.01390   3.576  0.00169 **
-## ADGKG       -0.03407    0.97805  -0.035  0.97253   
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 2.151815)
-## 
-##     Null deviance: 47.343  on 23  degrees of freedom
-## Residual deviance: 47.340  on 22  degrees of freedom
-## AIC: 90.412
-## 
-## Number of Fisher Scoring iterations: 2
-

The output lets us know that the intercept of our model is significantly different from 0 but our slope (i.e. our variable of interest) is not. This makes sense when we look at the data.

-
plot(shannon ~ ADGKG, data=meta)
-#Add the glm best fit line
-abline(glm.shannon.ADG)
-

-

Non-normally distributed metrics

-

We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better.

-

But which distribution should we choose? In statistics, there is no one “best” model. There are only good and better models. We will use the plot() function to compare two models and pick the better one.

-

First, the Gaussian (normal) distribution, which we already know is a bad fit.

-
gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian")
-par(mfrow = c(1,2))
-plot(gaussian.chao.ADG, which=c(1,2))
-

-

Quasipoisson (log) distribution

-
qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson")
-par(mfrow = c(1,2))
-plot(qp.chao.ADG, which=c(1,2))
-

-

What we’re looking for is no pattern in the Residuals vs. Fitted graph (“stars in the sky”), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot.

-

While it’s still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness.

-
summary(qp.chao.ADG)
-
## 
-## Call:
-## glm(formula = chao ~ ADGKG, family = "quasipoisson", data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -24.36  -17.05  -10.66   18.81   26.91  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   6.4528     0.5561  11.605 7.54e-11 ***
-## ADGKG        -0.1859     0.5438  -0.342    0.736    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 374.2485)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance: 8074.4  on 22  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 5
-

Plotting this we see that, indeed, there is no significant correlation between Chao and ADG.

-
#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)")
-abline(qp.chao.ADG)
-

-
-
-

Mixed models

-

Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG. In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr).

-

Think about your variables and what they mean “in the real world.” Logically combine them into as few ANOVA tests as possible. In the end, it’s better to test a meaningless interaction than not test a meaningful one.

-

We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The * symbol is a shortcut for models. A*B is equivalent to A + B + A:B

-
aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(aov.shannon.all)
-
##                Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup        2  42.98  21.489  95.472 2.61e-10 ***
-## ADGKG           1   0.05   0.054   0.239    0.631    
-## AgeGroup:ADGKG  2   0.26   0.130   0.576    0.572    
-## Residuals      18   4.05   0.225                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We can see that the interaction of age and ADG doesn’t significantly impact Shannon diversity, so we should remove that variable to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms.

-
aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(aov.shannon.all2)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   99.70 3.96e-11 ***
-## ADGKG        1   0.05   0.054    0.25    0.623    
-## Residuals   20   4.31   0.216                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only.

-
TukeyHSD(aov.shannon.all)
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## ADGKG
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## AgeGroup, ADGKG
-
## Warning in TukeyHSD.aov(aov.shannon.all): 'which' specified some non-
-## factors which will be dropped
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## $AgeGroup
-##             diff       lwr       upr    p adj
-## 2w-1yr -3.270063 -3.875469 -2.664657 0.00e+00
-## 8w-1yr -1.830903 -2.436309 -1.225496 1.20e-06
-## 8w-2w   1.439160  0.833754  2.044567 2.81e-05
-

However, you will see that we don’t get any data from ADG since it is continuous. There is a warning denoting this as “non-factors ignored: ADGKG”.

-

So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output.

-
glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(glm.shannon.all)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##     Min       1Q   Median       3Q      Max  
-## -1.0301  -0.2468   0.0894   0.1572   0.7624  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)        5.7123     2.5928   2.203   0.0409 *
-## AgeGroup2w        -3.3969     2.6197  -1.297   0.2111  
-## AgeGroup8w        -2.9610     2.7554  -1.075   0.2967  
-## ADGKG             -0.4481     2.7599  -0.162   0.8728  
-## AgeGroup2w:ADGKG   0.1228     2.7848   0.044   0.9653  
-## AgeGroup8w:ADGKG   1.0750     2.8763   0.374   0.7130  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.22508)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.0514  on 18  degrees of freedom
-## AIC: 39.413
-## 
-## Number of Fisher Scoring iterations: 2
-

Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. Let’s go through each line

-
    -
  • (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn’t expect it to be for any alpha-diversity metric since 0 means nothing is there

  • -
  • AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing “shannon ~ AgeGroup” and only looking at the 2w-1yr pairwise comparison)
  • -
  • AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison

  • -
  • ADGKG - the slope of Shannon to ADGKG (the same as testing “shannon ~ ADGKG”)

  • -
  • AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr
  • -
  • AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr

  • -
-

As we saw in ANOVA, none of the interaction terms are significant so we remove them.

-
glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(glm.shannon.all2)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup + ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -0.95299  -0.25858   0.07643   0.30409   0.74487  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   5.4459     0.3487  15.619 1.14e-12 ***
-## AgeGroup2w   -3.2760     0.2324 -14.094 7.55e-12 ***
-## AgeGroup8w   -1.7989     0.2408  -7.471 3.30e-07 ***
-## ADGKG        -0.1639     0.3281  -0.500    0.623    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.2155447)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.3109  on 20  degrees of freedom
-## AIC: 36.903
-## 
-## Number of Fisher Scoring iterations: 2
-

Note: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables.

-

We can run a similar test with non-normal data like Chao.

-
qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup * ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.774  -3.430  -0.140   3.692   5.277  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)       6.99825    0.71122   9.840 1.14e-08 ***
-## AgeGroup2w       -1.61539    0.75272  -2.146   0.0458 *  
-## AgeGroup8w       -2.24498    0.86846  -2.585   0.0187 *  
-## ADGKG             0.01751    0.75699   0.023   0.9818    
-## AgeGroup2w:ADGKG -0.42295    0.80094  -0.528   0.6039    
-## AgeGroup8w:ADGKG  0.86269    0.86550   0.997   0.3321    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 18.86331)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance:  348.5  on 18  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-

Remove the non-significant interaction.

-
qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all2)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup + ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.783  -3.452  -1.378   3.744   8.184  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)  7.03944    0.23567  29.870  < 2e-16 ***
-## AgeGroup2w  -1.98090    0.14862 -13.329 2.08e-11 ***
-## AgeGroup8w  -1.24286    0.11926 -10.422 1.57e-09 ***
-## ADGKG       -0.02643    0.24530  -0.108    0.915    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 23.74583)
-## 
-##     Null deviance: 8117.20  on 23  degrees of freedom
-## Residual deviance:  476.31  on 20  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-
-
-

Repeated measure

-

Another thing to consider with this data is the fact that we sampled the same animals over time. So, we have a repeated measures design. There are a number of ways to do repeated measures in R. I personally like the lme4 package used here.

-

We add the repeated measure component by adding a random effect for the individual animals with (1|Animal) in the lmer function.

-
rm.shannon.all = lmer(shannon ~ AgeGroup+ADGKG + (1|Animal), data=meta)
-summary(rm.shannon.all)
-
## Linear mixed model fit by REML ['lmerMod']
-## Formula: shannon ~ AgeGroup + ADGKG + (1 | Animal)
-##    Data: meta
-## 
-## REML criterion at convergence: 32.4
-## 
-## Scaled residuals: 
-##      Min       1Q   Median       3Q      Max 
-## -1.83117 -0.45932  0.09539  0.49972  1.53368 
-## 
-## Random effects:
-##  Groups   Name        Variance Std.Dev.
-##  Animal   (Intercept) 0.03793  0.1948  
-##  Residual             0.17819  0.4221  
-## Number of obs: 24, groups:  Animal, 8
-## 
-## Fixed effects:
-##             Estimate Std. Error t value
-## (Intercept)   5.3906     0.3520  15.313
-## AgeGroup2w   -3.2739     0.2114 -15.486
-## AgeGroup8w   -1.8104     0.2208  -8.201
-## ADGKG        -0.1049     0.3321  -0.316
-## 
-## Correlation of Fixed Effects:
-##            (Intr) AgGrp2 AgGrp8
-## AgeGroup2w -0.350              
-## AgeGroup8w -0.027  0.461       
-## ADGKG      -0.884  0.057 -0.293
-

We see that very little of the variance in the data is explained by the animal random effects (0.03793). So we actually don’t need to include repeated measures in our final model, but it was necessary to check!

-

From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.

-
-
-
-

Beta-diversity

-

Beta-diversity is between-sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (i.e. diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (i.e. richness: Jaccard, unweighted UniFrac).

-

Beta-diversity appears like the following (completely made-up numbers)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| .       | sample1 | sample2 | sample3 |
|---------|---------|---------|---------|
| sample1 | 0       | 0.345   | 0.194   |
| sample2 | 0.345   | 0       | 0.987   |
| sample3 | 0.194   | 0.987   | 0       |
-
-

Visualization

-

The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal coordinate analysis or PCA/PCoA if you’ve heard of that, only nMDS is more statistically robust with multiple iterations in the form of the trymax part of the command.

-

Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar microbiotas.

-
-

OTU-based metrics

-

There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account. The distance between Prevotella OTU1 and Prevotella OTU2 is equivalent to the distance between Prevotella OTU1 and Bacteroides OTU1.

-
-

Dot plots

-

First, we calculate the nMDS values for a 2-axis k=2 graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (i.e. diversity). This uses the metaMDS function from the package vegan.

-
BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.06208161 
-## Run 1 stress 0.06210668 
-## ... Procrustes: rmse 0.001636313  max resid 0.005662513 
-## ... Similar to previous best
-## Run 2 stress 0.06208261 
-## ... Procrustes: rmse 0.0008174643  max resid 0.00186259 
-## ... Similar to previous best
-## Run 3 stress 0.06208133 
-## ... New best solution
-## ... Procrustes: rmse 0.000495613  max resid 0.001143981 
-## ... Similar to previous best
-## Run 4 stress 0.06208228 
-## ... Procrustes: rmse 0.0002768028  max resid 0.0006083455 
-## ... Similar to previous best
-## Run 5 stress 0.06208254 
-## ... Procrustes: rmse 0.0003377152  max resid 0.0007457908 
-## ... Similar to previous best
-## Run 6 stress 0.06208233 
-## ... Procrustes: rmse 0.000285801  max resid 0.000626649 
-## ... Similar to previous best
-## Run 7 stress 0.06210685 
-## ... Procrustes: rmse 0.001453303  max resid 0.005539077 
-## ... Similar to previous best
-## Run 8 stress 0.062104 
-## ... Procrustes: rmse 0.001430176  max resid 0.005147467 
-## ... Similar to previous best
-## Run 9 stress 0.06208351 
-## ... Procrustes: rmse 0.0005018534  max resid 0.00111944 
-## ... Similar to previous best
-## Run 10 stress 0.06208269 
-## ... Procrustes: rmse 0.0003614257  max resid 0.0008024269 
-## ... Similar to previous best
-## Run 11 stress 0.06208154 
-## ... Procrustes: rmse 0.0004861021  max resid 0.001120926 
-## ... Similar to previous best
-## Run 12 stress 0.06212707 
-## ... Procrustes: rmse 0.001859292  max resid 0.005339963 
-## ... Similar to previous best
-## Run 13 stress 0.3702005 
-## Run 14 stress 0.06210406 
-## ... Procrustes: rmse 0.001425256  max resid 0.00512563 
-## ... Similar to previous best
-## Run 15 stress 0.06208142 
-## ... Procrustes: rmse 3.189023e-05  max resid 6.612762e-05 
-## ... Similar to previous best
-## Run 16 stress 0.06210429 
-## ... Procrustes: rmse 0.001578454  max resid 0.005195898 
-## ... Similar to previous best
-## Run 17 stress 0.06210796 
-## ... Procrustes: rmse 0.00155285  max resid 0.005626229 
-## ... Similar to previous best
-## Run 18 stress 0.06208191 
-## ... Procrustes: rmse 0.0001981339  max resid 0.0004391198 
-## ... Similar to previous best
-## Run 19 stress 0.06208168 
-## ... Procrustes: rmse 0.0001331311  max resid 0.000291077 
-## ... Similar to previous best
-## Run 20 stress 0.06210592 
-## ... Procrustes: rmse 0.001396183  max resid 0.005412384 
-## ... Similar to previous best
-## *** Solution reached
-

We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2 axes are sufficient to view the data.

-

Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages

-
par(mfrow = c(1, 1))
-#Create a blank plot for the nmds
-plot(BC.nmds, type="n", main="Bray-Curtis")
-#Add the points colored by age
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-#Add a legend
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

This will create a plot in the lower right quadrant. If you want to get fancy, type “?plot” in the console to see other ways to modify the plot function.

-

A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (i.e. richness).

-
J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.0620818 
-## Run 1 stress 0.06208178 
-## ... New best solution
-## ... Procrustes: rmse 0.0007016851  max resid 0.001623036 
-## ... Similar to previous best
-## Run 2 stress 0.06210633 
-## ... Procrustes: rmse 0.001409348  max resid 0.005467011 
-## ... Similar to previous best
-## Run 3 stress 0.06210745 
-## ... Procrustes: rmse 0.001470069  max resid 0.00557513 
-## ... Similar to previous best
-## Run 4 stress 0.06208144 
-## ... New best solution
-## ... Procrustes: rmse 0.0001309513  max resid 0.0002717662 
-## ... Similar to previous best
-## Run 5 stress 0.06208156 
-## ... Procrustes: rmse 5.349512e-05  max resid 0.0001195792 
-## ... Similar to previous best
-## Run 6 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 2.027381e-05  max resid 4.710602e-05 
-## ... Similar to previous best
-## Run 7 stress 0.06208345 
-## ... Procrustes: rmse 0.0004560942  max resid 0.001010311 
-## ... Similar to previous best
-## Run 8 stress 0.06210681 
-## ... Procrustes: rmse 0.001448074  max resid 0.005531499 
-## ... Similar to previous best
-## Run 9 stress 0.06208334 
-## ... Procrustes: rmse 0.0004470347  max resid 0.000984174 
-## ... Similar to previous best
-## Run 10 stress 0.06208155 
-## ... Procrustes: rmse 7.705878e-05  max resid 0.0001651192 
-## ... Similar to previous best
-## Run 11 stress 0.06208217 
-## ... Procrustes: rmse 0.0002412108  max resid 0.0005340427 
-## ... Similar to previous best
-## Run 12 stress 0.06210429 
-## ... Procrustes: rmse 0.001420012  max resid 0.005133791 
-## ... Similar to previous best
-## Run 13 stress 0.06208263 
-## ... Procrustes: rmse 0.0002884997  max resid 0.0006395557 
-## ... Similar to previous best
-## Run 14 stress 0.06208166 
-## ... Procrustes: rmse 0.0001135875  max resid 0.0002424163 
-## ... Similar to previous best
-## Run 15 stress 0.06210651 
-## ... Procrustes: rmse 0.001438738  max resid 0.005503184 
-## ... Similar to previous best
-## Run 16 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 6.516686e-05  max resid 0.0001605969 
-## ... Similar to previous best
-## Run 17 stress 0.06208244 
-## ... Procrustes: rmse 0.0002976643  max resid 0.0007159927 
-## ... Similar to previous best
-## Run 18 stress 0.06208222 
-## ... Procrustes: rmse 0.0002618419  max resid 0.0006358936 
-## ... Similar to previous best
-## Run 19 stress 0.06208197 
-## ... Procrustes: rmse 0.000208525  max resid 0.0005678922 
-## ... Similar to previous best
-## Run 20 stress 0.0620832 
-## ... Procrustes: rmse 0.0004189108  max resid 0.0009707012 
-## ... Similar to previous best
-## *** Solution reached
-
plot(J.nmds, type="n", main="Jaccard")
-points(J.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-3, 1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. This is because Jaccard is a transformation of Bray-Curtis with J = 2BC/(1+BC)

-
-
-

Ellipses

-

You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using ordiellipse from vegan.

-

Code courtesy of Madison Cox.

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota.

-
-
-

3D plots

-

If your stress is high (like over 0.3) for your metaMDS calculation, you probably need to increase to 3 axes k=3. Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the plotly package to visualize a 3D Bray-Curtis plot.

-
#Calculate the Bray-Curtis nMDS for 3-axis
-BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.04686346 
-## Run 1 stress 0.04741659 
-## Run 2 stress 0.04673425 
-## ... New best solution
-## ... Procrustes: rmse 0.01073904  max resid 0.0344814 
-## Run 3 stress 0.05061835 
-## Run 4 stress 0.04740131 
-## Run 5 stress 0.04984642 
-## Run 6 stress 0.04747801 
-## Run 7 stress 0.05226505 
-## Run 8 stress 0.05295437 
-## Run 9 stress 0.04741387 
-## Run 10 stress 0.0457586 
-## ... New best solution
-## ... Procrustes: rmse 0.03868237  max resid 0.1296728 
-## Run 11 stress 0.05094992 
-## Run 12 stress 0.04719303 
-## Run 13 stress 0.05012352 
-## Run 14 stress 0.04750204 
-## Run 15 stress 0.0479423 
-## Run 16 stress 0.04579561 
-## ... Procrustes: rmse 0.004692476  max resid 0.01495666 
-## Run 17 stress 0.05069634 
-## Run 18 stress 0.0485804 
-## Run 19 stress 0.05058189 
-## Run 20 stress 0.04859459 
-## Run 21 stress 0.04996713 
-## Run 22 stress 0.04740079 
-## Run 23 stress 0.04747632 
-## Run 24 stress 0.04675455 
-## Run 25 stress 0.04747574 
-## Run 26 stress 0.0486171 
-## Run 27 stress 0.04575823 
-## ... New best solution
-## ... Procrustes: rmse 0.0005374711  max resid 0.0008831403 
-## ... Similar to previous best
-## *** Solution reached
-

Extract x-y-z values for this nmds

-
BCxyz = scores(BC.nmds.3D, display="sites")
-#This is a table that looks like 
-BCxyz
-
##                 NMDS1       NMDS2        NMDS3
-## 5017.1yr.F -4.7973931  0.33029806 -0.211481225
-## 5017.2w.F   3.1867260  0.06208276  1.484970505
-## 5017.8w.F   1.0614871 -2.13025264 -1.218243774
-## 5020.1yr.F -4.7579235  0.24440345 -0.002888360
-## 5020.2w.F   3.4979230 -1.00981047  1.015200903
-## 5020.8w.F   1.5897780 -1.93435391  0.464128291
-## 5026.1yr.F -4.7720517  0.20611823  0.214815994
-## 5026.2w.F   3.3976411  1.10010056 -0.616957559
-## 5026.8w.F   3.1483050  2.07715934  1.478767471
-## 5031.1yr.F -4.8021402  0.44250394  0.202447638
-## 5031.2w.F   3.3537430  0.48376070 -1.490408346
-## 5031.8w.F   0.8577869 -1.64300786  0.250766536
-## 5037.1yr.F -4.8522745  0.48898068 -0.004218580
-## 5037.2w.F   3.6593056  0.26886383 -0.507062657
-## 5037.8w.F   3.1326413 -0.82210579 -0.024946820
-## 5041.1yr.F -4.7724198  0.28335210  0.060469429
-## 5041.2w.F   3.1661815  2.43615798 -1.218459457
-## 5041.8w.F   1.0947996 -2.58325770 -0.236659085
-## 5045.1yr.F -4.7522029  0.16444286  0.004405471
-## 5045.2w.F   1.5110480  3.11956405 -0.469494555
-## 5045.8w.F   1.4900615 -2.17087166 -0.450930039
-## 5053.1yr.F -4.8259682  0.39929033 -0.016428020
-## 5053.2w.F   3.2932453  2.30299477  0.813801957
-## 5053.8w.F   0.8917011 -2.11641360  0.478404284
-

Plot the xyz coordinates and color by age

-
plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red"))
-
- -

Note: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so.

-
par(mfrow=c(1,2))
-#Axis 1 and 2 (x and y)
-plot(BCxyz[,1], BCxyz[,2], main="Bray-Curtis 1:2", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-5.4, 3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Axis 1 and 3 (x and z)
-plot(BCxyz[,1], BCxyz[,3], main="Bray-Curtis 1:3", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-

-
-
-
-

Phylogenetic-based metrics

-

The most common of this type of beta-diversity metrics is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant.

-

Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data.

-

Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands.

-
-

Create physeq object

-

To start, you must make a phyloseq object which includes the OTU.clean, meta, and tax.clean data. We tell R which tables are each type

-
OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE)
-tax.UF = tax_table(as.matrix(tax.clean))
-meta.UF = sample_data(meta)
-

We then merge these into an object of class phyloseq.

-
physeq = phyloseq(OTU.UF, tax.UF, meta.UF)
-

To add the phylogenetic component to UniFrac, we calculate a rooted phylogenetic tree of our OTUs. This takes a long time so we have provided the tree for you.

-

However, if we were to calculate a tree, first, we import a distance matrix created from representative sequences of our OTUs. We would use phangorn to read the file as it was created in mothur as seen under “Trees of OTUs” here.

-

DO NOT RUN THIS

-
dist.mat = import_mothur_dist("clean_repFasta.phylip.dist")
-

We would then calculate a rooted neighbor-joining tree from the distance matrix using the ape package.

-

DO NOT RUN THIS

-
NJ.tree = bionj(dist.mat)
-

Instead, we have pre-calculated this tree and you can load it with

-
load("NJ.tree.Rdata")
-

Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations.

-
physeq.tree = merge_phyloseq(physeq, NJ.tree)
-

We can look at this object and see its components.

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
-
-

Dot plots

-

Calculate weighted UniFrac (i.e. diversity) distances and ordinate into an nMDS. We specify weighted with weighted=TRUE.

-
wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00062 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 0.0864543 
-## Run 1 stress 0.08645377 
-## ... New best solution
-## ... Procrustes: rmse 0.0001213931  max resid 0.0003141587 
-## ... Similar to previous best
-## Run 2 stress 0.1335727 
-## Run 3 stress 0.1463023 
-## Run 4 stress 0.08645329 
-## ... New best solution
-## ... Procrustes: rmse 0.0007206919  max resid 0.001920389 
-## ... Similar to previous best
-## Run 5 stress 0.1270238 
-## Run 6 stress 0.1157455 
-## Run 7 stress 0.1143571 
-## Run 8 stress 0.1317677 
-## Run 9 stress 0.08645345 
-## ... Procrustes: rmse 5.804039e-05  max resid 0.0001620988 
-## ... Similar to previous best
-## Run 10 stress 0.08808605 
-## Run 11 stress 0.08645348 
-## ... Procrustes: rmse 0.000642139  max resid 0.001706552 
-## ... Similar to previous best
-## Run 12 stress 0.1157451 
-## Run 13 stress 0.0864534 
-## ... Procrustes: rmse 4.051435e-05  max resid 0.0001125382 
-## ... Similar to previous best
-## Run 14 stress 0.1143564 
-## Run 15 stress 0.08659435 
-## ... Procrustes: rmse 0.004251655  max resid 0.01804703 
-## Run 16 stress 0.1295296 
-## Run 17 stress 0.0864538 
-## ... Procrustes: rmse 0.000161137  max resid 0.0004585026 
-## ... Similar to previous best
-## Run 18 stress 0.1347981 
-## Run 19 stress 0.08645297 
-## ... New best solution
-## ... Procrustes: rmse 0.0003657154  max resid 0.0008934259 
-## ... Similar to previous best
-## Run 20 stress 0.08808625 
-## *** Solution reached
-

You can plot UniFrac nMDS using the basic plot function as we’ve done before.

-
par(mfrow=c(1,1))
-plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(0.3,0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

But let’s also look at the ggplot2 package. This package is incredibly powerful and can be customized in many ways. This document has many helpful tips.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Weighted UniFrac")
-

-

Unweighted UniFrac (i.e. richness) can be visualized in the same way. We specify unweighted with weighted=FALSE.

-
uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00541 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 9.695153e-05 
-## Run 1 stress 9.657832e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.750783e-05  max resid 0.0002776914 
-## ... Similar to previous best
-## Run 2 stress 9.871795e-05 
-## ... Procrustes: rmse 8.086551e-05  max resid 0.0002819207 
-## ... Similar to previous best
-## Run 3 stress 9.488623e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.261501e-05  max resid 0.0002642816 
-## ... Similar to previous best
-## Run 4 stress 9.862006e-05 
-## ... Procrustes: rmse 1.701217e-05  max resid 5.025527e-05 
-## ... Similar to previous best
-## Run 5 stress 9.806631e-05 
-## ... Procrustes: rmse 0.0001070473  max resid 0.0002353732 
-## ... Similar to previous best
-## Run 6 stress 9.757454e-05 
-## ... Procrustes: rmse 3.985665e-05  max resid 0.0001388531 
-## ... Similar to previous best
-## Run 7 stress 9.826177e-05 
-## ... Procrustes: rmse 9.722135e-05  max resid 0.0002191936 
-## ... Similar to previous best
-## Run 8 stress 9.695708e-05 
-## ... Procrustes: rmse 7.448687e-05  max resid 0.0002751687 
-## ... Similar to previous best
-## Run 9 stress 9.907648e-05 
-## ... Procrustes: rmse 9.310993e-05  max resid 0.0002388289 
-## ... Similar to previous best
-## Run 10 stress 9.984534e-05 
-## ... Procrustes: rmse 3.384419e-05  max resid 0.0001260377 
-## ... Similar to previous best
-## Run 11 stress 9.684607e-05 
-## ... Procrustes: rmse 0.0001319037  max resid 0.0003356478 
-## ... Similar to previous best
-## Run 12 stress 9.69891e-05 
-## ... Procrustes: rmse 8.404145e-06  max resid 2.447679e-05 
-## ... Similar to previous best
-## Run 13 stress 0.0002969569 
-## ... Procrustes: rmse 0.0003866364  max resid 0.0006715474 
-## ... Similar to previous best
-## Run 14 stress 9.723199e-05 
-## ... Procrustes: rmse 3.731826e-05  max resid 0.0001336343 
-## ... Similar to previous best
-## Run 15 stress 9.99257e-05 
-## ... Procrustes: rmse 0.0001270356  max resid 0.0003614341 
-## ... Similar to previous best
-## Run 16 stress 9.955355e-05 
-## ... Procrustes: rmse 6.056256e-05  max resid 0.0001673759 
-## ... Similar to previous best
-## Run 17 stress 9.589429e-05 
-## ... Procrustes: rmse 1.686683e-05  max resid 4.596185e-05 
-## ... Similar to previous best
-## Run 18 stress 9.633493e-05 
-## ... Procrustes: rmse 3.660483e-05  max resid 0.0001324208 
-## ... Similar to previous best
-## Run 19 stress 9.921893e-05 
-## ... Procrustes: rmse 1.085938e-05  max resid 1.669484e-05 
-## ... Similar to previous best
-## Run 20 stress 9.637055e-05 
-## ... Procrustes: rmse 6.450683e-05  max resid 0.0001970587 
-## ... Similar to previous best
-## *** Solution reached
-
## Warning in metaMDS(ps.dist): Stress is (nearly) zero - you may have
-## insufficient data
-
plot_ordination(physeq.tree, uwUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Unweighted UniFrac")
-

-
-
-

Ellipses

-

Ellipses can be plotted instead of points as well. With the basic plot function:

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
legend(0.3, 0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We can also plot ellipses in ggplot2. However, these ellipses are not exactly the same as the standard error ellipses used with OTU-based metrics, as they use different underlying calculations. Still, they get at the same question of confidence intervals for groups of points on an nMDS.

-

We plot ellipses with ggplot2 by adding the stat_ellipse function to our plot.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  stat_ellipse() + 
-  ggtitle("Weighted UniFrac")
-

-
-
-

3D plots

-

3D UniFrac ordinations are not currently supported by phyloseq. We see that our ordinations only include 2 dimensions.

-
wUF.ordu
-
## 
-## Call:
-## metaMDS(comm = ps.dist) 
-## 
-## global Multidimensional Scaling using monoMDS
-## 
-## Data:     ps.dist 
-## Distance: user supplied 
-## 
-## Dimensions: 2 
-## Stress:     0.08645297 
-## Stress type 1, weak ties
-## Two convergent solutions found after 20 tries
-## Scaling: centring, PC rotation 
-## Species: scores missing
-

But we can instead calculate UniFrac distances using UniFrac and then ordinate on 3 axes with metaMDS.

-
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
-## Randomly assigning root as -- Otu03194 -- in the phylogenetic tree in the
-## data you provided.
-
wUF.nmds.3D = metaMDS(wUF.dist, method="NMDS", k=3)
-
## Run 0 stress 0.04217486 
-## Run 1 stress 0.05952471 
-## Run 2 stress 0.05952709 
-## Run 3 stress 0.042174 
-## ... New best solution
-## ... Procrustes: rmse 0.0003317483  max resid 0.0007893038 
-## ... Similar to previous best
-## Run 4 stress 0.04217542 
-## ... Procrustes: rmse 0.0005403913  max resid 0.0014387 
-## ... Similar to previous best
-## Run 5 stress 0.0421741 
-## ... Procrustes: rmse 0.0001810271  max resid 0.000555628 
-## ... Similar to previous best
-## Run 6 stress 0.05952602 
-## Run 7 stress 0.04217451 
-## ... Procrustes: rmse 0.0003976044  max resid 0.001227917 
-## ... Similar to previous best
-## Run 8 stress 0.06815104 
-## Run 9 stress 0.05952564 
-## Run 10 stress 0.04217457 
-## ... Procrustes: rmse 0.0004479109  max resid 0.001435945 
-## ... Similar to previous best
-## Run 11 stress 0.04217428 
-## ... Procrustes: rmse 0.0003207273  max resid 0.0009212836 
-## ... Similar to previous best
-## Run 12 stress 0.04217476 
-## ... Procrustes: rmse 0.0004904995  max resid 0.001357519 
-## ... Similar to previous best
-## Run 13 stress 0.04217443 
-## ... Procrustes: rmse 0.0003308483  max resid 0.0008748533 
-## ... Similar to previous best
-## Run 14 stress 0.04217414 
-## ... Procrustes: rmse 0.0002102509  max resid 0.000611423 
-## ... Similar to previous best
-## Run 15 stress 0.04217491 
-## ... Procrustes: rmse 0.0005257634  max resid 0.001791904 
-## ... Similar to previous best
-## Run 16 stress 0.04217454 
-## ... Procrustes: rmse 0.0003986916  max resid 0.001121447 
-## ... Similar to previous best
-## Run 17 stress 0.04217553 
-## ... Procrustes: rmse 0.0004447142  max resid 0.001546131 
-## ... Similar to previous best
-## Run 18 stress 0.04217399 
-## ... New best solution
-## ... Procrustes: rmse 0.0001824097  max resid 0.0005684325 
-## ... Similar to previous best
-## Run 19 stress 0.04217406 
-## ... Procrustes: rmse 7.68744e-05  max resid 0.0001772352 
-## ... Similar to previous best
-## Run 20 stress 0.04217417 
-## ... Procrustes: rmse 0.0001240512  max resid 0.0002862878 
-## ... Similar to previous best
-## *** Solution reached
-

Then, similar to what we did with Bray-Curtis/Jaccard, we pull out the xyz values and plot with plotly.

-
wUFxyz = scores(wUF.nmds.3D, display="sites")
-#This is a table that looks like 
-wUFxyz
-
##                  NMDS1        NMDS2       NMDS3
-## 5017.1yr.F -0.19591424  0.107765310  0.07968290
-## 5017.2w.F   0.40329083  0.187040546 -0.11891085
-## 5017.8w.F  -0.06738145  0.046058811 -0.21927277
-## 5020.1yr.F -0.21311918  0.100813200  0.06833139
-## 5020.2w.F  -0.02918765 -0.163606283 -0.02929884
-## 5020.8w.F   0.03375300  0.054503745 -0.09099989
-## 5026.1yr.F -0.22482781  0.066613100  0.05594134
-## 5026.2w.F   0.13241677 -0.217029557  0.08745439
-## 5026.8w.F   0.38996273  0.135464299  0.24011205
-## 5031.1yr.F -0.19996967  0.080398029  0.09445703
-## 5031.2w.F   0.19084848 -0.256852240  0.01563640
-## 5031.8w.F  -0.13587208 -0.042300350 -0.02591350
-## 5037.1yr.F -0.21800838  0.076413856  0.07189119
-## 5037.2w.F   0.05187202 -0.120151694 -0.04223782
-## 5037.8w.F   0.14227112 -0.115591151 -0.01897721
-## 5041.1yr.F -0.20911338  0.081709200  0.07441520
-## 5041.2w.F   0.27813371 -0.237693762  0.03647625
-## 5041.8w.F  -0.13928666 -0.001531998 -0.18656755
-## 5045.1yr.F -0.23328251  0.051043269  0.06274834
-## 5045.2w.F   0.49259170  0.294540193 -0.14634317
-## 5045.8w.F  -0.16902451 -0.126094687 -0.13841874
-## 5053.1yr.F -0.21539833  0.077884489  0.08008741
-## 5053.2w.F   0.27502987 -0.030380383  0.17559141
-## 5053.8w.F  -0.13978439 -0.049015941 -0.12588496
-
plot_ly(x=wUFxyz[,1], y=wUFxyz[,2], z=wUFxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red"))
-
- -
-
-
-

Vectors for continuous variables

-

While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots.

-

To do this, we first fit the variables to our distances using the envfit function in vegan. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac.

-
fit.BC = envfit(BC.nmds, meta) 
-fit.BC
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## AgeExact -0.99887 -0.04744 0.9765  0.001 ***
-## ADGKG     0.12503  0.99215 0.0770  0.444    
-## chao     -0.98567  0.16868 0.9599  0.001 ***
-## shannon  -0.69400  0.71997 0.9469  0.001 ***
-## simpson   0.42087 -0.90712 0.7353  0.001 ***
-## ace      -0.99746  0.07129 0.9078  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##                   NMDS1   NMDS2
-## Animalcow5017   -0.1841  0.5449
-## Animalcow5020    0.0059  0.6577
-## Animalcow5026    0.4243 -0.8826
-## Animalcow5031   -0.2442  0.1175
-## Animalcow5037    0.4946 -0.0566
-## Animalcow5041    0.0500 -0.0290
-## Animalcow5045   -0.1374 -0.3384
-## Animalcow5053   -0.4090 -0.0134
-## AgeGroup1yr     -4.4470 -0.1800
-## AgeGroup2w       2.5047 -1.0509
-## AgeGroup8w       1.9422  1.2309
-## AgeGroup.ord2w   2.5047 -1.0509
-## AgeGroup.ord8w   1.9422  1.2309
-## AgeGroup.ord1yr -4.4470 -0.1800
-## 
-## Goodness of fit:
-##                  r2 Pr(>r)    
-## Animal       0.0248  0.997    
-## AgeGroup     0.9134  0.001 ***
-## AgeGroup.ord 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We see that it has automatically fit every variable in our meta table.

-

The simplest way around this is to just ask envfit to run on only the variables you want.

-
fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")])
-fit.BC
-
## 
-## ***VECTORS
-## 
-##         NMDS1   NMDS2    r2 Pr(>r)
-## ADGKG 0.12503 0.99215 0.077  0.452
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -4.4470 -0.1800
-## AgeGroup2w   2.5047 -1.0509
-## AgeGroup8w   1.9422  1.2309
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We repeat for weighted UniFrac

-
fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")])
-fit.wUF
-
## 
-## ***VECTORS
-## 
-##          NMDS1    NMDS2     r2 Pr(>r)
-## ADGKG -0.17846  0.98395 0.0398   0.66
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -0.1076 -0.0834
-## AgeGroup2w   0.1432  0.0322
-## AgeGroup8w  -0.0356  0.0511
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.5588  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group’s name. For continuous variables, it adds an arrow in the direction from smallest to largest value.

-

Note: The P-values for variables in envfit are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, envfit P-values tell you how well the arrow or centroids fit the x-y data of the nMDS, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was not significant, the fitted arrow/centroids will also not be significant. We see this type of result here, but this will not always be the case.

-

However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it’s difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in envfit that you know are significant in other tests may still be represented on an nMDS as a visual aid.

-

Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won’t use these plots in a publication, but it is good practice.

-

For Bray-Curtis:

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black")
-

-

You could also ask it to only plot variables with a fit P-value < 0.05, so we would only see the centroids.

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black", p.max=0.05)
-

-

Weighted UniFrac

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(.3,.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.wUF, col="black")
-

-

You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group.

-

Fitting all OTUs would take a while, so we will only fit the first 10 in our table.

-
fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10])
-fit.BC.OTU
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## Otu00001  0.71738 -0.69668 0.2478  0.033 *  
-## Otu00002  0.46984 -0.88275 0.2109  0.057 .  
-## Otu00003  0.25719 -0.96636 0.2503  0.021 *  
-## Otu00004  0.25006  0.96823 0.2738  0.030 *  
-## Otu00005  0.15473  0.98796 0.2910  0.003 ** 
-## Otu00006 -0.96867  0.24837 0.6743  0.001 ***
-## Otu00007  0.17991 -0.98368 0.2488  0.009 ** 
-## Otu00008  0.40157  0.91583 0.3108  0.016 *  
-## Otu00009  0.26275 -0.96487 0.1894  0.062 .  
-## Otu00010  0.33868 -0.94090 0.1552  0.078 .  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#We will only plot significant arrows in this case
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.OTU, col="black", p.max=0.05)
-

-

You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs.

-
#Extract all OTUs within the genus Ruminococcus
-OTU.Rumino = OTU.clean[,tax.clean$Genus == "g__Ruminococcus"]
-#Sum the abundances of the Ruminococcus OTUs into one variable (column)
-OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino)
-
-#Fit the new Ruminococcus group
-fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum)
-fit.BC.Rumino
-
## 
-## ***VECTORS
-## 
-##         NMDS1    NMDS2     r2 Pr(>r)    
-## [1,] -0.14506  0.98942 0.6621  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#Plot
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.Rumino, col="black", labels=c("Ruminococcus"))
-

-
-
-
-

Statistically test beta-diversity

-

While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest.

-

You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are significant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. For example, group 1 has a lot of Prevotella OTU1 and group 2 has a lot of Prevotella OTU2, but they are both Prevotella so UniFrac treats them as being very similar.

-
-

PERMANOVA

-

For Bray-Curtis or Jaccard, we use the vegan package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model.

-

Note: adonis cannot handle or account for NA or blanks in your data. Subset to only samples with complete metadata before running vegdist if these exist.

-
#Calculate distance and save as a matrix
-BC.dist=vegdist(OTU.clean, distance="bray")
-#Run PERMANOVA on distances.
-adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.618382    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.929071    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

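Similarly for Jaccard. Note: vegdist selects its metric with the method argument (default "bray"); the distance argument used in the call below is not a vegdist argument, which is why the output shown is identical to the Bray-Curtis result. Use method="jaccard" to obtain Jaccard distances.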

-
J.dist=vegdist(OTU.clean, distance="jaccard")
-adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.632368    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.920080    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We see that the interaction is not significant so we remove it.

-
adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.616384    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.566434    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

For UniFrac, we use the phyloseq package to calculate distances and then vegan to run PERMANOVA.

-
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
-## Randomly assigning root as -- Otu00842 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2   0.71682 0.35841  7.6290 0.43422 0.000999 ***
-## ADGKG           1   0.03281 0.03281  0.6984 0.01988 0.665335    
-## AgeGroup:ADGKG  2   0.05553 0.02777  0.5910 0.03364 0.871129    
-## Residuals      18   0.84564 0.04698         0.51226             
-## Total          23   1.65080                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = FALSE, normalized = TRUE):
-## Randomly assigning root as -- Otu01729 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.4956 1.74781  9.1479 0.46952 0.000999 ***
-## ADGKG           1    0.2434 0.24343  1.2741 0.03270 0.218781    
-## AgeGroup:ADGKG  2    0.2669 0.13344  0.6984 0.03585 0.832168    
-## Residuals      18    3.4391 0.19106         0.46193             
-## Total          23    7.4450                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Remove non-significant interaction term

-
adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2   0.71682 0.35841  7.9543 0.43422 0.000999 ***
-## ADGKG      1   0.03281 0.03281  0.7282 0.01988 0.626374    
-## Residuals 20   0.90117 0.04506         0.54590             
-## Total     23   1.65080                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.4956 1.74781  9.4324 0.46952 0.000999 ***
-## ADGKG      1    0.2434 0.24343  1.3137 0.03270 0.206793    
-## Residuals 20    3.7060 0.18530         0.49778             
-## Total     23    7.4450                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
-
-

ANOSIM

-

If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if your group sizes are very, very different, like 10 vs 100.

-

For example, Bray-Curtis:

-
anosim(BC.dist, meta$AgeGroup, permutations = 1000)
-
## 
-## Call:
-## anosim(dat = BC.dist, grouping = meta$AgeGroup, permutations = 1000) 
-## Dissimilarity: bray 
-## 
-## ANOSIM statistic R: 0.8467 
-##       Significance: 0.000999 
-## 
-## Permutation: free
-## Number of permutations: 1000
-

Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.

-
-
-

2D variables

-

These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi.

-

Mantel from vegan tests if two distance matrices co-vary, i.e., whether the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter.

-

You can only compare samples where you have both types of data, so we must subset our OTU table to only the samples that we also have SCFA for. The names are a little different between the tables, so we also add “.F” to the SCFA names to make them match.

-
OTU.SCFA = OTU.clean[row.names(OTU.clean) %in% paste(row.names(SCFA), ".F", sep=""),]
-

We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it.

-
dist1 = vegdist(OTU.SCFA)
-dist2 = vegdist(SCFA)
-

Run a Mantel test comparing the 2 matrices.

-
mantel(dist1, dist2, permutations=100)
-
## 'nperm' >= set of all permutations: complete enumeration.
-
## Set of permutations < 'minperm'. Generating entire set.
-
## 
-## Mantel statistic based on Pearson's product-moment correlation 
-## 
-## Call:
-## mantel(xdis = dist1, ydis = dist2, permutations = 100) 
-## 
-## Mantel statistic r: -0.02423 
-##       Significance: 0.54167 
-## 
-## Upper quantiles of permutations (null model):
-##   90%   95% 97.5%   99% 
-## 0.540 0.552 0.596 0.629 
-## Permutation: free
-## Number of permutations: 23
-

We see that the overall OTU table and SCFA tables do not co-vary.

-

You can also run Mantel on 3 matrices at once like so

-

Do not run as we do not have 3 matrices here

-
mantel.partial(dist1, dist2, dist3, permutations=100)
-
-
-
-

Beta dispersion

-

Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances).

-

Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM

-

Calculate dispersion (variances) within each group.

-
disp.age = betadisper(BC.dist, meta$AgeGroup)
-

Perform an ANOVA-like test to determine if the variances differ by groups.

-
permutest(disp.age, pairwise=TRUE, permutations=1000)
-
## 
-## Permutation test for homogeneity of multivariate dispersions
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Response: Distances
-##           Df  Sum Sq  Mean Sq     F N.Perm   Pr(>F)    
-## Groups     2 0.47459 0.237293 30.93   1000 0.000999 ***
-## Residuals 21 0.16111 0.007672                          
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## Pairwise comparisons:
-## (Observed p-value below diagonal, permuted p-value above diagonal)
-##            1yr         2w     8w
-## 1yr            9.9900e-04 0.0010
-## 2w  4.8556e-06            0.7902
-## 8w  1.2886e-06 7.7206e-01
-

Combining this with our plot,

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers.

-
-
-
-

OTUs that differ by

-
-

Categorical variables

-

Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don’t differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don’t want to look at over 5000 P-values, do you?

-

There are a number of ways to decrease the number of OTUs you’re looking at:

-
    -
  1. Don’t use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest
  2. -
  3. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample
  4. -
  5. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples
  6. -
  7. Combine the abundance and frequency cutoffs above
  8. -
-

However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low-abundance OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren’t useful?

-

So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. Note: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables.

-

SIMPER’s output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are cumulative, so to get each OTU’s contribution, you must subtract the previous OTU’s value.

-

For example

-
simper(OTU.clean, meta$AgeGroup, permutations=100)
-
## cumulative contributions of most influential species:
-## 
-## $`1yr_2w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00011  Otu00006  Otu00009 
-## 0.0983761 0.1627191 0.2225335 0.2657879 0.2982889 0.3271508 0.3514210 
-##  Otu00014  Otu00022  Otu00018  Otu00012  Otu00016  Otu00004  Otu00021 
-## 0.3660756 0.3793171 0.3924608 0.4048922 0.4171422 0.4283988 0.4385280 
-##  Otu00008  Otu00025  Otu00028  Otu00023  Otu00037  Otu00013  Otu00035 
-## 0.4479076 0.4565849 0.4646081 0.4723795 0.4790690 0.4857141 0.4920793 
-##  Otu00055  Otu00030  Otu00036  Otu00040  Otu00042  Otu00010  Otu00049 
-## 0.4983615 0.5045449 0.5106265 0.5166717 0.5226378 0.5274331 0.5321886 
-##  Otu00046  Otu00033  Otu00031  Otu00081  Otu00051  Otu00064  Otu00056 
-## 0.5368030 0.5413764 0.5458188 0.5500936 0.5543565 0.5582465 0.5620674 
-##  Otu00032  Otu00052  Otu00062  Otu00026  Otu00020  Otu00074  Otu00069 
-## 0.5657989 0.5695078 0.5730822 0.5765920 0.5799406 0.5831741 0.5864067 
-##  Otu00066  Otu00077  Otu00148  Otu00073  Otu00067  Otu00065  Otu00076 
-## 0.5895953 0.5927428 0.5958511 0.5989588 0.6020549 0.6051241 0.6081334 
-##  Otu00075  Otu00091  Otu00048  Otu00097  Otu00068  Otu00050  Otu00084 
-## 0.6111073 0.6140400 0.6169121 0.6196512 0.6223697 0.6250661 0.6277023 
-##  Otu00100  Otu00019  Otu00063  Otu00039  Otu00086  Otu00071  Otu00101 
-## 0.6303356 0.6329664 0.6355752 0.6381709 0.6406744 0.6431362 0.6455850 
-##  Otu00089  Otu00096  Otu00095  Otu00108  Otu00088  Otu00103  Otu00094 
-## 0.6480310 0.6504700 0.6528884 0.6553007 0.6576757 0.6600472 0.6624184 
-##  Otu00098  Otu00116  Otu00090  Otu00105  Otu00104  Otu00099  Otu00059 
-## 0.6647575 0.6670589 0.6693444 0.6716046 0.6738590 0.6760506 0.6781917 
-##  Otu00106  Otu00115  Otu00102  Otu00110  Otu00119  Otu00118  Otu00034 
-## 0.6803196 0.6824245 0.6844633 0.6865021 0.6884972 0.6904775 0.6924261 
-##  Otu00114  Otu00093  Otu00124  Otu00045 
-## 0.6943714 0.6962690 0.6981558 0.7000319 
-## 
-## $`1yr_8w`
-##   Otu00001   Otu00005   Otu00006   Otu00004   Otu00010   Otu00017 
-## 0.03765603 0.07335078 0.10010930 0.12226268 0.14087762 0.15688502 
-##   Otu00008   Otu00009   Otu00015   Otu00018   Otu00016   Otu00014 
-## 0.17205091 0.18718833 0.20107546 0.21456235 0.22713556 0.23964967 
-##   Otu00029   Otu00019   Otu00021   Otu00025   Otu00024   Otu00037 
-## 0.25102468 0.26162658 0.27202671 0.28093293 0.28829315 0.29516652 
-##   Otu00035   Otu00044   Otu00055   Otu00027   Otu00036   Otu00040 
-## 0.30170335 0.30821052 0.31465848 0.32109529 0.32733731 0.33354206 
-##   Otu00042   Otu00020   Otu00013   Otu00041   Otu00003   Otu00043 
-## 0.33966556 0.34564370 0.35158279 0.35717451 0.36261926 0.36799345 
-##   Otu00038   Otu00026   Otu00034   Otu00049   Otu00070   Otu00046 
-## 0.37334038 0.37836130 0.38334135 0.38822230 0.39310161 0.39783775 
-##   Otu00012   Otu00058   Otu00011   Otu00051   Otu00054   Otu00045 
-## 0.40234701 0.40670755 0.41102172 0.41521298 0.41939306 0.42353985 
-##   Otu00047   Otu00064   Otu00056   Otu00052   Otu00048   Otu00002 
-## 0.42764688 0.43163954 0.43556497 0.43937178 0.44313291 0.44683135 
-##   Otu00062   Otu00031   Otu00057   Otu00061   Otu00053   Otu00074 
-## 0.45050368 0.45405112 0.45759807 0.46109474 0.46455875 0.46787762 
-##   Otu00069   Otu00066   Otu00077   Otu00073   Otu00067   Otu00079 
-## 0.47119548 0.47447192 0.47770248 0.48089214 0.48406988 0.48721802 
-##   Otu00083   Otu00078   Otu00076   Otu00075   Otu00091   Otu00121 
-## 0.49033806 0.49342871 0.49651735 0.49956976 0.50257978 0.50549547 
-##   Otu00097   Otu00092   Otu00032   Otu00084   Otu00129   Otu00050 
-## 0.50830678 0.51111612 0.51389884 0.51660098 0.51922111 0.52181856 
-##   Otu00100   Otu00101   Otu00096   Otu00108   Otu00095   Otu00086 
-## 0.52434751 0.52686095 0.52936793 0.53184756 0.53429667 0.53674109 
-##   Otu00089   Otu00088   Otu00103   Otu00094   Otu00098   Otu00116 
-## 0.53918547 0.54162316 0.54405719 0.54649097 0.54889172 0.55125394 
-##   Otu00105   Otu00104   Otu00143   Otu00123   Otu00082   Otu00039 
-## 0.55357747 0.55589135 0.55819397 0.56049152 0.56278380 0.56503978 
-##   Otu00099   Otu00130   Otu00090   Otu00106   Otu00107   Otu00115 
-## 0.56728918 0.56953083 0.57176616 0.57395024 0.57611979 0.57828018 
-##   Otu00087   Otu00153   Otu00102   Otu00110   Otu00119   Otu00118 
-## 0.58042631 0.58252590 0.58461849 0.58671108 0.58875879 0.59079874 
-##   Otu00022   Otu00072   Otu00080   Otu00093   Otu00124   Otu00112 
-## 0.59281824 0.59481609 0.59678509 0.59873275 0.60067308 0.60260107 
-##   Otu00122   Otu00131   Otu00132   Otu00134   Otu00128   Otu00125 
-## 0.60450552 0.60639869 0.60828362 0.61014314 0.61199594 0.61383412 
-##   Otu00133   Otu00159   Otu00139   Otu00127   Otu00114   Otu00137 
-## 0.61566158 0.61747930 0.61928689 0.62106367 0.62282385 0.62455846 
-##   Otu00136   Otu00194   Otu00138   Otu00144   Otu00142   Otu00135 
-## 0.62629042 0.62801571 0.62974033 0.63143945 0.63312281 0.63480281 
-##   Otu00147   Otu00120   Otu00188   Otu00126   Otu00028   Otu00211 
-## 0.63647550 0.63814069 0.63980299 0.64140642 0.64300322 0.64457174 
-##   Otu00154   Otu00146   Otu00173   Otu00156   Otu00158   Otu00157 
-## 0.64612078 0.64764950 0.64917769 0.65068721 0.65217234 0.65364696 
-##   Otu00060   Otu00168   Otu00140   Otu00163   Otu00171   Otu00113 
-## 0.65508066 0.65651008 0.65793253 0.65931862 0.66069801 0.66207484 
-##   Otu00178   Otu00200   Otu00165   Otu00170   Otu00164   Otu00187 
-## 0.66344999 0.66480785 0.66616041 0.66748648 0.66881018 0.67012189 
-##   Otu00151   Otu00213   Otu00149   Otu00183   Otu00192   Otu00167 
-## 0.67141176 0.67269928 0.67397558 0.67525135 0.67652371 0.67778788 
-##   Otu00177   Otu00181   Otu00180   Otu00236   Otu00186   Otu00199 
-## 0.67904574 0.68029263 0.68151160 0.68272731 0.68393783 0.68512983 
-##   Otu00253   Otu00150   Otu00204   Otu00169   Otu00218   Otu00189 
-## 0.68632029 0.68750539 0.68867418 0.68982822 0.69097221 0.69210846 
-##   Otu00182   Otu00184   Otu00226   Otu00270   Otu00172   Otu00225 
-## 0.69323878 0.69436709 0.69548866 0.69660494 0.69770318 0.69878699 
-##   Otu00185   Otu00203 
-## 0.69986670 0.70093653 
-## 
-## $`2w_8w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00009  Otu00005  Otu00011 
-## 0.1101390 0.1804133 0.2466786 0.2952479 0.3351854 0.3745198 0.4100899 
-##  Otu00004  Otu00010  Otu00017  Otu00008  Otu00012  Otu00015  Otu00022 
-## 0.4397781 0.4641945 0.4818672 0.4987872 0.5154942 0.5307997 0.5454777 
-##  Otu00029  Otu00013  Otu00019  Otu00020  Otu00028  Otu00006  Otu00023 
-## 0.5580145 0.5704325 0.5824230 0.5910912 0.5996473 0.6081657 0.6166261 
-##  Otu00024  Otu00027  Otu00031  Otu00044  Otu00030  Otu00041  Otu00043 
-## 0.6247348 0.6322130 0.6396626 0.6468237 0.6539027 0.6600291 0.6659522 
-##  Otu00038  Otu00032  Otu00026  Otu00070  Otu00033  Otu00034  Otu00047 
-## 0.6718453 0.6776585 0.6834157 0.6887933 0.6940870 0.6992933 0.7044391
-

We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups. They are not necessarily significantly different.

-

To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age.

-
kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00001 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 15.994, df = 2, p-value = 0.0003364
-

In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group

-
kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00017 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 4.9767, df = 2, p-value = 0.08305
-

Note: These P-values have not been corrected for false discovery rate (fdr) yet.

-

Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his GitHub page and we have provided them for this workshop in /Steinberger_scripts

-

Disclaimer: Andrew has provided these scripts out of the goodness of his heart and provides no guarantee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.

-

The use of these scripts is as follows (from Steinberger GitHub with some modifications)

-

simper_pretty.R

-

This script is meant to rapidly perform the SIMPER function from the R package vegan for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. Requires installation of the R package ‘vegan’.

-

Usage:

-

simper.pretty(x, metrics, c(‘interesting’), perc_cutoff=0.5, low_cutoff = ‘y’, low_val=0.01, ‘output_name’)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output.
  • -
  • low_cutoff: ‘y’ if you want to REMOVE OTUs that individually contribute less than low_val (e.g. 1%)
  • -
  • low_val: set value of low cutoff (0.01), ignored if low_cutoff=‘n’.
  • -
  • output_name: the name that is appended to the output filename “_clean_simper.csv“.
  • -
-

R_krusk.R

-

This script takes the output .csv of simper_pretty.R, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages ‘vegan’ and ‘dplyr’.

-

Usage:

-

kruskal.pretty(x, metrics, csv, c(‘interesting’), ‘output_name’, taxonomy)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv(“PATH to name_clean_simper.csv”))
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • output_name: the name that is appended to the output filename “_krusk_simper.csv“.
  • -
  • taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional)
  • -
-

First, we load these functions into R.

-
source("Steinberger_scripts/simper_pretty.r")
-source("Steinberger_scripts/R_krusk.r")
-

Then, we apply them to our data. We will ask for all SIMPER OTUs (perc_cutoff = 1, meaning up to cumulative 100%) but cut off any OTUs that individually contribute less than 1% to SIMPER (low_val=0.01). You may want to consider different cutoffs for your data.

-
simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age')
-
-simper.results = data.frame(read.csv("Age_clean_simper.csv"))
-kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax)
-

If we import the Kruskal-Wallis results back into R and select only OTUs that were significantly different after fdr correction (fdr_krusk_p.val)…

-
#Import
-KW.results = data.frame(read.csv("Age_krusk_simper.csv"))
-#Remove non-significant
-KW.results.signif = KW.results[KW.results$fdr_krusk_p.val < 0.05,]
-#Order by OTU#
-KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),]
-head(KW.results.signif)
-
##     X Comparison     SIMPER      OTU  krusk_p.val fdr_krusk_p.val
-## 2   2     1yr_2w 0.06434298 Otu00001 0.0004510953     0.001383359
-## 15 15     1yr_8w 0.03765603 Otu00001 0.0004510953     0.001383359
-## 1   1     1yr_2w 0.09837610 Otu00002 0.0004510953     0.001383359
-## 30 30      2w_8w 0.11013903 Otu00002 0.0208625823     0.029989962
-## 3   3     1yr_2w 0.05981442 Otu00003 0.0003310658     0.001383359
-## 32 32      2w_8w 0.06626526 Otu00003 0.0356919001     0.044373714
-##                                                                                                                   Taxonomy
-## 2          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 15         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 1          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 30         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 3  k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-## 32 k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-##    Left.mean.abund   Left.stdev Right.mean.abund Right.stdev
-## 2     7.109140e-06 2.010768e-05      0.128370197  0.16351829
-## 15    7.109140e-06 2.010768e-05      0.073292635  0.09803742
-## 1     7.118451e-06 2.013402e-05      0.196185324  0.23796423
-## 30    1.961853e-01 2.379642e-01      0.007205221  0.01601067
-## 3     0.000000e+00 0.000000e+00      0.119333403  0.18000346
-## 32    1.193334e-01 1.800035e-01      0.010598818  0.02126522
-

We see a number of OTUs that significantly differ by age group.

-

Looking at OTU1 as relative abundance

-
#Calculate abundance
-abund = OTU.clean/rowSums(OTU.clean)*100
-#plot
-boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1")
-

-

and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves.

-
-
-

Continuous variables

-

For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models glm of the OTU abundances with different distributions family= similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model.

-
-
-

Correlations

-

So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the “strong” category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available.

-

Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables?

-

Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set.

-
#Remember we calculated abundance before with
-#abund = OTU.clean/rowSums(OTU.clean)*100
-
-#Subset OTUs to abundance cutoff
-OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))]
-
-cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall")
-cor.kendall
-
##                  [,1]
-## Otu00001  0.189852125
-## Otu00002  0.211764129
-## Otu00003  0.027397313
-## Otu00004  0.275867615
-## Otu00005  0.165056323
-## Otu00006 -0.114462240
-## Otu00007  0.143930930
-## Otu00008  0.211764129
-## Otu00009 -0.177517901
-## Otu00010  0.176299258
-## Otu00011  0.208334326
-## Otu00012  0.017236256
-## Otu00013  0.269669049
-## Otu00015  0.018077538
-## Otu00016 -0.257293680
-## Otu00017  0.284293111
-## Otu00019  0.172479145
-## Otu00020  0.102188122
-## Otu00022 -0.034040152
-## Otu00023  0.004106646
-## Otu00024  0.073416202
-## Otu00027  0.412640807
-## Otu00029  0.076924424
-## Otu00030 -0.077670805
-## Otu00031  0.286002668
-## Otu00038 -0.271163072
-## Otu00041  0.125193349
-## Otu00043  0.189645652
-## Otu00044  0.239065695
-## Otu00053 -0.217652255
-## Otu00055 -0.112428004
-## Otu00070 -0.037317590
-

In this case, we don’t see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm.

-

Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data.

-
#Calculate abundances
-abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100
-
-#Subset OTUs to abundance cutoff
-OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))]
-
-cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall")
-cor.kendall
-
##             Formate    Acetate Propionate Isobutyrate   Butyrate
-## Otu00006  0.0000000  0.1825742  0.1825742   0.1825742  0.1825742
-## Otu00014  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00016 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00018 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00021 -0.9128709 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00025  0.9128709  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00035 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00036 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00037 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00040 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00042  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00046 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00049 -0.1825742 -0.3333333 -0.3333333   0.0000000 -0.3333333
-## Otu00051  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00052 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00056 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00064 -0.5477226 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00066 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00067  0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00069  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00074  0.5477226  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00077  0.1825742  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00088  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00089  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00097 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00100 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00113 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00192  0.5477226  0.6666667  0.6666667   1.0000000  0.6666667
-## Otu00295  0.2581989  0.2357023  0.2357023   0.7071068  0.2357023
-##            iVal.2MB   Valerate
-## Otu00006 -0.1825742  0.1825742
-## Otu00014 -0.3333333  0.0000000
-## Otu00016 -0.3333333 -0.6666667
-## Otu00018 -0.3333333 -0.6666667
-## Otu00021 -0.6666667 -0.3333333
-## Otu00025  0.6666667  0.3333333
-## Otu00035 -0.6666667 -1.0000000
-## Otu00036  0.0000000 -0.3333333
-## Otu00037  0.0000000  0.3333333
-## Otu00040 -0.6666667 -1.0000000
-## Otu00042 -0.3333333  0.0000000
-## Otu00046 -0.3333333 -0.6666667
-## Otu00049  0.3333333  0.0000000
-## Otu00051  1.0000000  0.6666667
-## Otu00052 -0.6666667 -1.0000000
-## Otu00056 -0.3333333 -0.6666667
-## Otu00064 -1.0000000 -0.6666667
-## Otu00066 -0.6666667 -1.0000000
-## Otu00067  0.6666667  0.3333333
-## Otu00069  1.0000000  0.6666667
-## Otu00074  0.0000000  0.3333333
-## Otu00077  0.3333333  0.6666667
-## Otu00088  0.0000000 -0.3333333
-## Otu00089  0.0000000 -0.3333333
-## Otu00097  0.0000000  0.3333333
-## Otu00100  0.0000000  0.3333333
-## Otu00113  0.0000000 -0.3333333
-## Otu00192  0.6666667  1.0000000
-## Otu00295  0.7071068  0.7071068
-

If the data table is too large to view in R, you can write it to a table in your project folder.

-
write.table(cor.kendall, file = "cor_kendall.csv", sep = ",")
-

We see that some OTUs strongly correlate with SCFAs. For example, Otu00021 and Otu00025 with Formate

-
par(mfrow = c(1, 2))
-plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21")
-plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25")
-

-

Clearly we don’t have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit.

-
OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate)
-summary(OTU21.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00021 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##       1        2        3        4  
-## -56.173   96.253  -46.747    6.668  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    357.75      51.46   6.952   0.0201 *
-## SCFA$Formate  -540.02     201.13  -2.685   0.1152  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 7324.907)
-## 
-##     Null deviance: 67454  on 3  degrees of freedom
-## Residual deviance: 14650  on 2  degrees of freedom
-## AIC: 50.175
-## 
-## Number of Fisher Scoring iterations: 2
-
OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate)
-summary(OTU25.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00025 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##        1         2         3         4  
-##  127.727  -118.783     6.217   -15.162  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    219.78      74.49   2.951   0.0982 .
-## SCFA$Formate   721.00     291.12   2.477   0.1316  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 15346.04)
-## 
-##     Null deviance: 124819  on 3  degrees of freedom
-## Residual deviance:  30692  on 2  degrees of freedom
-## AIC: 53.133
-## 
-## Number of Fisher Scoring iterations: 2
-

So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occurring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing.

-
-
-
-

Other visualizations

-
-

Bar charts

-

The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the phyloseq / ggplot2 packages.

-

Let’s explore some of the bar chart options. First, we’ll make the classic additive bar chart for phyla in our samples

-
plot_bar(physeq.tree, fill="Phylum")
-

-

We can simplify by grouping our samples by age group

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") 
-

-

And removing the lines between OTUs in the bars

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And only showing the top 5 most abundant phyla

-
#Sort the Phyla by abundance and pick the top 5
-top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5]
-#Cut down the physeq.tree data to only the top 5 Phyla
-top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names))
-#Plot
-plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

There are many more options within ggplot2 to alter this figure. This document has many helpful tips.

-

Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid

-
plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And you can break it down at any taxonomic level and color by any other level.

-
-
-

Trees

-

We can also plot phylogenetic trees and label/modify them by our variables of interest.

-

Let’s look at the genus Prevotella in our data. We want to subset down to just this genus or else our plot would be too cluttered to read.

-

Subset by genus

-
prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella")
-

We can see that this worked by comparing the number of taxa in our subset and our original data

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
prevotella
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 106 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 106 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 106 tips and 105 internal nodes ]
-

We can plot these OTUs on a tree.

-
plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE)
-

-

In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots.

-

Let’s make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs

-
plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE)
-

-

Already it’s a little difficult to read. You can view a larger page by clicking “Zoom” above the figure. Or export the figure as a PDF and save as a full page size, 8.5x11.

-

There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options.

-
-
-

Heat maps

-

There are some good options in both phyloseq and gplots to make heatmaps. We will go through phyloseq but know that the same things could be done in gplots with code specific to that package.

-
-

OTUs

-

We’re going to just look at the 20 most abundant OTUs to make it more readable.

-
#Sort the OTUs by abundance and pick the top 20
-top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20])
-#Cut down the physeq.tree data to only the top 20 OTUs
-top20OTU = prune_taxa(top20OTU.names, physeq.tree)
-

We now see that we only have 20 taxa

-
top20OTU
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 20 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 20 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 20 tips and 19 internal nodes ]
-

First, you can make a heatmap of OTU abundance across all samples

-
plot_heatmap(top20OTU)
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And grouped by our age groups

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can label the OTU taxa

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And group OTUs within the same Phyla

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can also change the colors (white -> purple), including the 0s/NAs (grey).

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="grey")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We’ll show Bray-Curtis as an example. The available options are

-
    -
  • bray
  • -
  • jaccard
  • -
  • wunifrac
  • -
  • uwunifrac
  • -
-
plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-
-
-

Beta-diversity

-

The other common use for heatmaps is to show distances between samples (i.e. beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS.

-

We do not want to use the plot_heatmap() function from phyloseq because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a gplots command. This command will automatically group samples by similarity (trees)

-
#Bray-Curtis
-heatmap.2(as.matrix(BC.dist))
-

-
#UniFrac
-heatmap.2(as.matrix(wUF.dist))
-

-

You could also change the colors

-
#Rainbow colors
-rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9)
-heatmap.2(as.matrix(BC.dist), col=rc)
-

-

As always, for further customization, explore with ?heatmap.2

-
-
-
-

Venn diagrams

-

Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F

-

Create a list of OTUs that occur (count > 0) in each sample.

-
    -
  • We select for the row by name with OTU.clean[“name”,]
  • -
  • We select the columns with a value >0 with OTU.clean[,apply()]
  • -
-
OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))])
-

We can then use these lists of OTUs to plot a Venn diagram with venn() from the gplots package

-
venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr))
-

-

We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr

-
OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))])
-

And plot

-
venn(list(OTU.2w, OTU.8w, OTU.1yr))
-

-

These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn.

-

Once you have these, you can use the VennDiagram package for prettier graphing options. For example, the age group Venns would be

-
draw.triple.venn(area1 = 385+58+71+320, area2 = 801+190+320+71, area3 = 3177+190+58+71, n12 = 320+71, n23 = 190+71, n13 = 58+71, n123 = 71, category = c("2w", "8w", "1yr"), lty = "blank", fill = c("green", "red", "blue"))
-

-
## (polygon[GRID.polygon.1343], polygon[GRID.polygon.1344], polygon[GRID.polygon.1345], polygon[GRID.polygon.1346], polygon[GRID.polygon.1347], polygon[GRID.polygon.1348], text[GRID.text.1349], text[GRID.text.1350], text[GRID.text.1351], text[GRID.text.1352], text[GRID.text.1353], text[GRID.text.1354], text[GRID.text.1355], text[GRID.text.1356], text[GRID.text.1357], text[GRID.text.1358])
-

Or with venneuler, you can scale the circles to be proportional to the total number of OTUs in that group

-
#Create a venneuler object
-age.venn=venneuler(c('A' = 385+58+71+320, 'B' = 801+190+320+71, 'C' = 3177+190+58+71, 'A&B' = 320+71, 'B&C' = 190+71, 'A&C' = 58+71, 'A&B&C' = 71))
-
-#Add group names
-age.venn$labels = c("2w", "8w", "1yr")
-
-#Plot
-plot(age.venn)
-

-

Or we can export the OTU lists and make Venns with this online tool http://bioinformatics.psb.ugent.be/webtools/Venn/. This tool is handy in that it gives you the list of OTUs within the Venn sections so that you can see which specific bacteria are shared.

-
write.table(OTU.2w, "OTU.2w.csv", sep=",", row.names=FALSE, col.names=FALSE)
-write.table(OTU.8w, "OTU.8w.csv", sep=",", row.names=FALSE, col.names=FALSE)
-write.table(OTU.1yr, "OTU.1yr.csv", sep=",", row.names=FALSE, col.names=FALSE)
-
-
-

Networks

-
-

OTUs

-

You can plot the distances between OTUs as a network. It would be an unreadable mess to plot all the OTUs in our data set, so we will just use the smaller prevotella data set.

-
plot_net(prevotella, color="Species", type="taxa")
-

-

For co-occurrence networks of OTUs, I recommend Gephi or Cytoscape. Thus far, I have not found an R package comparable to these other programs.

-
-
-

Beta-diversity

-

You can also plot beta-diversity as a network where the edges (lines) are the distances between samples. All metrics we’ve used here are supported (bray, jaccard, wunifrac, uwunifrac)

-
plot_net(physeq.tree, color="AgeGroup", distance="bray")
-

-
-
-
-
-

Publication figures

-

Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the “Export” function within the Plots window, but this often does not result in high enough resolution.

-

Ideally, you want to save in PostScript (.ps) or PDF (.pdf) formats because they are vector-based, meaning they are not any specific dpi and do not get blurry when zoomed in. Other formats (PNG, JPG, BMP, TIFF) are pixel-based formats (little square dots) and can become jagged when zoomed in.

-

If you have issues getting a specific font to work, try installing and loading the package extrafont.

-
-

PostScript

-

Here, we will use postscript to export as a .ps. This function uses

-
    -
  • width, height: in inches unless otherwise specified with units=
  • -
  • horizontal: TRUE = landscape, FALSE = portrait
  • -
  • colormodel: RGB, CMYK, and others
  • -
  • family: Font to be used within figures
  • -
-

Then we add layout if we have more than one plot within the overall figure.

-
    -
  • matrix: -
      -
    • A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4)
    • -
    • Then the number of rows, columns the figures should be oriented in
    • -
  • -
  • widths: A list of scalars of how large each figure should be in width.
  • -
  • heights: A list of scalars of how large each figure should be in height.
  • -
-
postscript("Fig1.ps", width = 7, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT")
- 
-layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
- 
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
- 
-boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
- 
-dev.off()
-
## png 
-##   2
-

To open the resulting .ps file:

-
    -
  • Open it directly in Adobe Illustrator (vectors are preserved)
  • -
  • On a Mac, double-clicking on it will convert it automatically into a PDF and will open automatically into Preview.
  • -
  • On Windows, it depends on how “file associations” are set-up. Typically the file would need some transformation on a “standard” Windows computer before it can be used. If Adobe software is installed, it could run via Distiller to convert the .ps to a PDF.
  • -
-
-
-

PDF

-

To export directly to a PDF, we will use pdf

-
pdf("Fig1.pdf", width = 7, height = 3,  colormodel = "rgb", family = "ArialMT")
- 
-layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
- 
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
- 
-boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
- 
-dev.off()
-
## png 
-##   2
-
-
-

PNG

-

PNG is pixel-based so it may get blurry if not at high enough resolution. The exact resolution can be specified by giving the dpi in res=

-
png("Fig1.png", width = 7, height = 3, units='in', res=300)
- 
-layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
- 
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
- 
-boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
- 
-dev.off()
-
## png 
-##   2
-
- - -
-
-
- - - -
-
- -
- - - - - - - - diff --git a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_update.html b/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_update.html deleted file mode 100755 index b202f13..0000000 --- a/Microbiota_analysis_R/Rpubs/Microbiota_Analysis_in_R_update.html +++ /dev/null @@ -1,2445 +0,0 @@ - - - - - - - - - - - - - - - -Microbiota Analysis in R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
-
-
-
-
- -
- - - - - - - -

Updated April 5, 2017

-

Online version available at http://rpubs.com/dillmcfarlan/R_microbiotaSOP

-
-

Tips for this workshop

-
    -
  1. If you have any issues in R, type ??command into the console where “command” is the function you are having issues with and a help page will come up.
  2. -
  3. Lines starting with # are comments that are for the reader’s benefit. These lines are not code and do not need to be entered into the console.
  4. -
  5. GREY boxes contain code that you can copy and paste to run on your machine.
  6. -
-
#GREY box
-
    -
  1. WHITE boxes contain sample output of this code, and nothing will happen if you try to copy it into your console.

    -
    WHITE box
  2. -
  3. Basic R code you may find useful: -
      -
    1. Matrices/data frames are designated by [ , ] where it is [rows, columns]
    2. -
    3. | is or
    4. -
    5. & is and
    6. -
  4. -
-
-
-

Introduction

-

Written for R v3.3.2 in RStudio v1.0.136

-
-

Goal

-

The goal of this tutorial is to demonstrate basic analyses of microbiota data to determine if and how communities differ by variables of interest. In general, this pipeline can be used for any microbiota data set that has been clustered into operational taxonomic units (OTUs).

-

This tutorial assumes some basic statistical knowledge. Please consider if your data fit the assumptions of each test (normality? equal sampling? Etc.). If you are not familiar with statistics at this level, we strongly recommend collaborating with someone who is. The incorrect use of statistics is a pervasive and serious problem in the sciences so don’t become part of the problem! That said, this is an introductory tutorial and there are many, many further analyses that can be done with microbiota data. Hopefully, this is just the start for your data!

-
-
-

Data

-

The data used here were created using 2x250 bp amplicon sequencing of the bacterial V4 region of the 16S rRNA gene on the Illumina MiSeq platform. The full data set is in Dill-McFarland et al. Sci Rep 7: 40864. Here, we will use a subset of samples. Specifically, we will be correlating the fecal bacterial microbiota of 8 dairy calves at different ages (2 weeks, 8 weeks, 1 year) to variables like weight gain (average daily gain in kg, ADGKG) and gastrointestinal short chain fatty acids (SCFA).

-
-
-

Files

-

We will use the following files created using the Microbiota Processing in mothur: Standard Operating Procedure (SOP).

-
    -
  • example.final.nn.unique_list.0.03.norm.shared (OTU table)
  • -
  • example.final.nn.unique_list.0.03.cons.taxonomy (Taxonomy of OTUs)
  • -
-

We will also be using tab-delimited metadata and SCFA files created in Excel. The metadata includes our metadata (like age and ADGKG) as well as alpha-diversity metrics from example.final.nn.unique_list.0.03.norm.groups.summary calculated in mothur. The SCFA table is the mM concentrations of different SCFAs in rumen (stomach) liquids from 1-year-old animals.

-
    -
  • example.metadata.txt
  • -
  • example.SCFA.txt
  • -
-

Finally, we will be loading a number of custom scripts from Steinberger_scripts and a pre-calculated OTU tree NJ.tree.RData. The information for creating this tree is provided in this tutorial.

-
-
-
-

Get set up

-
-

Download and install

-
    -
  • Base R: http://cran.mtu.edu/
  • -
  • RStudio: https://www.rstudio.com/products/rstudio/download3/
  • -
  • Packages: Open RStudio on your computer. If you have not already downloaded these packages, go to the lower right quadrant of your screen and open the Package tab. Click “download” and search for the package you want to download. -
      -
    • tidyr
    • -
    • dplyr
    • -
    • vegan
    • -
    • ape
    • -
    • ggplot2
    • -
    • gplots
    • -
    • plotly
    • -
    • phangorn
    • -
    • VennDiagram
    • -
    • venneuler
    • -
    • phyloseq (phyloseq is not on CRAN, so we have to call it manually. See below.)
    • -
  • -
-

Copy and paste the following into your console.

-
source("https://bioconductor.org/biocLite.R")
-
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
-
biocLite("phyloseq")
-
## BioC_mirror: https://bioconductor.org
-
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.3 (2017-03-06).
-
## Installing package(s) 'phyloseq'
-
## package 'phyloseq' successfully unpacked and MD5 sums checked
-## 
-## The downloaded binary packages are in
-##  C:\Users\Kim\AppData\Local\Temp\RtmpA7CLxP\downloaded_packages
-
## installation path not writeable, unable to update packages: cluster,
-##   lattice, survival
-
## Old packages: 'Biostrings', 'IRanges', 'S4Vectors', 'XVector'
-

Note: If you are having trouble installing packages, turn off your computer’s firewall temporarily.

-
-
-

Organization

-

All of our analyses will be organized into a “Project”.

-

Make a new project by selecting File->New project. Select “New Directory” and “Empty Project”. Name the project “Microbiota_Analysis_BRC” and save the project to your Desktop. Place all of your files for this analysis in the folder created on the Desktop

-

Create a new R script (File->New file->R script) to save your code. This file will automatically be saved in the project folder.

-

Now your screen should look like this

-
    -
  • Upper left: Where you type and save the code you want to run.
  • -
  • Upper right: Files you load into and create in R. To view one, click on it and it will open in the upper left pane.
  • -
  • Lower left: The console. Where commands and outputs run (similar to the one mothur window).
  • -
  • Lower right: Variable. Explore the different tabs.
  • -
-
-
-
-

Data manipulation

-
-

Load Packages

-

The “library” command tells R to open the package you want to use. You need to do this every time you open R.

-
#This package will help us more easily manipulate our data, which are matrices
-library(tidyr)
-
-#This package will also help us more easily manipulate our data
-library(dplyr)
-
## 
-## Attaching package: 'dplyr'
-
## The following objects are masked from 'package:stats':
-## 
-##     filter, lag
-
## The following objects are masked from 'package:base':
-## 
-##     intersect, setdiff, setequal, union
-
#The vegan package provides tools for descriptive community ecology. It has most basic functions of diversity analysis, community ordination and dissimilarity analysis. In general, this package is used for Bray-Curtis and Jaccard analyses.
-library(vegan)
-
## Loading required package: permute
-
## Loading required package: lattice
-
## This is vegan 2.4-3
-
#The phyloseq package seeks to address issues with multiple microbiome analysis packages by providing a set of functions that internally manage the organizing, linking, storing, and analyzing of phylogenetic sequencing data. In general, this package is used for UniFrac analyses.
-library(phyloseq)
-
-#Analyses of Phylogenetics and Evolution package. Required for tree calculations to be used with phyloseq
-library(ape)
-
-#Graphing package used in phyloseq. To edit the default setting of a plot, you need to use functions in this package.
-library(ggplot2)
-
-#This package is used to calculate and plot Venn diagrams as well as heatmaps
-library(gplots)
-
## 
-## Attaching package: 'gplots'
-
## The following object is masked from 'package:stats':
-## 
-##     lowess
-
#A package to create interactive web graphics of use in 3D plots
-library(plotly)
-
## 
-## Attaching package: 'plotly'
-
## The following object is masked from 'package:ggplot2':
-## 
-##     last_plot
-
## The following object is masked from 'package:stats':
-## 
-##     filter
-
## The following object is masked from 'package:graphics':
-## 
-##     layout
-
#used to read in mothur-formatted files
-library(phangorn)
-
## 
-## Attaching package: 'phangorn'
-
## The following objects are masked from 'package:vegan':
-## 
-##     diversity, treedist
-
#Pretty Venn diagrams
-library(VennDiagram)
-
## Loading required package: grid
-
## Loading required package: futile.logger
-
## 
-## Attaching package: 'VennDiagram'
-
## The following object is masked from 'package:ape':
-## 
-##     rotate
-
library(venneuler)
-
## Loading required package: rJava
-
-
-

Load Data

-

In the code, the text before = is what the file will be called in R. Make this short but unique as this is how you will tell R to use this file in later commands.

-
    -
  • header: tells R that the first row is column names, not data
  • -
  • row.names: tells R that the first column is row names, not data
  • -
  • sep: tells R that the data are tab-delimited. If you had a comma-delimited file, you would use sep=","
  • -
-
#OTU table (shared file)
-OTU = read.table("example.final.an.unique_list.0.03.norm.shared", header=TRUE, sep="\t")
-
-#Taxonomy of each OTU
-tax = read.table("example.final.an.unique_list.0.03.cons.taxonomy", header=TRUE, sep="\t")
-
-#Metadata. Since we made this in Excel, not mothur, we can use the "row.names" modifier to automatically name the rows by the values in the first column (sample names)
-meta = read.table("example.metadata.txt", header=TRUE, row.names=1, sep="\t")
-
-#SCFA data
-SCFA = read.table("example.SCFA.txt", header=TRUE, row.names=1, sep="\t")
-
-
-

Clean up the data

-

You can look at your data by clicking on it in the upper-right quadrant “Environment”

-

There are several unneeded columns and incorrect formatting in the tables as they were output by mothur. We will now fix them.

-
-

OTU table

-

We need to use the “Group” column as the row names so that it will match our metadata

-
row.names(OTU) = OTU$Group
-

We then need to remove the “label”, “numOtus”, and “Group” columns as they are not OTU counts like the rest of the table

-
OTU.clean = OTU[,-which(names(OTU) %in% c("label", "numOtus", "Group"))]
-
-
-

Taxonomy table

-

For the taxonomy table, we name the rows by the OTU #

-
row.names(tax) = tax$OTU
-

Remove all the OTUs that don’t occur in our OTU.clean data set

-
tax.clean = tax[row.names(tax) %in% colnames(OTU.clean),]
-

We then need to separate the “taxonomy” column so that each level (i.e. Domain, Phylum, etc) is in its own column. We do this with a special command “separate” from the tidyr package

-
tax.clean = separate(tax.clean, Taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain"), sep=";")
-

Finally, we remove the “Size” and “Strain” columns as well as “OTU” since these are now the row names

-
tax.clean = tax.clean[,-which(names(tax.clean) %in% c("Size", "Strain", "OTU"))]
-
-
-

Metadata and SCFA tables

-

These tables do not require any modification since I created them in Excel exactly as I need them for this R analysis.

-
-
-
-

Order the data

-

To make viewing and using the data easier, we will make sure our tables have samples (rows) in the same order. Since OTU.clean, meta, and SCFA have sample names as row names, we order by these.

-
OTU.clean = OTU.clean[order(row.names(OTU.clean)),]
-meta = meta[order(row.names(meta)),]
-SCFA = SCFA[order(row.names(SCFA)),]
-

Our taxonomy table is already in order from OTU1 to OTUN so we do not need to order it.

-
-
-

Set seed

-

We will be running some processes that rely on the random number generator. To make your analysis reproducible, we set the random seed.

-
set.seed(8765)
-
-
-
-

Alpha-diversity

-

Alpha-diversity is within sample diversity. It is how many different species (OTUs) are in each sample (richness) and how evenly they are distributed (evenness), which together are diversity. Each sample has one value for each metric.

-

This image illustrates richness vs. diversity. Both forests have the same richness (4 tree species) but Community 1 has much more even distribution of the 4 species while Community 2 is dominated by tree species A. This makes Community 1 more diverse than Community 2.

-
-

Explore alpha metrics

-

Now we will start to look at our data. We will first start with alpha-diversity and richness. Let’s plot some common ones here.

-
#Create 2x2 plot environment so that we can see all 4 metrics at once. 
-par(mfrow = c(2, 2))
-
-#Then plot each metric.
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(meta$simpson, main="Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

You want the data to be roughly normal so that you can run ANOVA or t-tests. If it is not normally distributed, you will need to consider non-parametric tests such as Kruskal-Wallis.

-

Here, we see that none of the data are normally distributed. This occurs with the subset but not the full data set because I’ve specifically selected samples with divergent alpha metrics. In general, you will see roughly normal data for Shannon’s diversity as well as most richness metrics. Simpson’s diversity, on the other hand, is usually skewed as seen here.

-

So most will use inverse Simpson (1/Simpson) instead. This not only increases normalcy but also makes the output more logical as a higher inverse Simpson value corresponds to higher diversity.

-

Let’s look at inverse Simpson instead.

-
#Create 2x2 plot environment 
-par(mfrow = c(2, 2))
-
-#Plots
-hist(meta$shannon, main="Shannon diversity", xlab="", breaks=10)
-hist(1/meta$simpson, main="Inverse Simpson diversity", xlab="", breaks=10)
-hist(meta$chao, main="Chao richness", xlab="", breaks=15)
-hist(meta$ace, main="ACE richness", xlab="", breaks=15)
-

-

Now we see a bimodal distribution for Simpson similar to the richness metrics.

-

To test for normalcy statistically, we can run the Shapiro-Wilk test of normality.

-
shapiro.test(meta$shannon)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$shannon
-## W = 0.91511, p-value = 0.0456
-
shapiro.test(1/meta$simpson)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  1/meta$simpson
-## W = 0.74821, p-value = 4.69e-05
-
shapiro.test(meta$chao)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$chao
-## W = 0.80636, p-value = 0.0003749
-
shapiro.test(meta$ace)
-
## 
-##  Shapiro-Wilk normality test
-## 
-## data:  meta$ace
-## W = 0.83017, p-value = 0.0009573
-

We see that, as expected from the graphs, none are normal.

-

However, our sample size is small and normalcy tests are very sensitive for small data-sets. In fact, you can run Shapiro-Wilk on a list of 50 values randomly sampled from the R-generated normal distribution and find that they are not normal (even though we know that they are!)

-

So, what does this mean for our purposes? Well, we should run statistical tests that don’t assume our data is normal, because we don’t have any evidence (graphs, Shapiro-Wilk) that it is normal. For demonstration purposes, though, we will run other tests as well.

-

Overall, for alpha-diversity:

-
    -
  • ANOVA, t-test, or general linear models with the normal distribution are used when the data is roughly normal
  • -
  • Kruskal-Wallis, Wilcoxon rank sum test, or general linear models with another distribution are used when the data is not normal
  • -
-

Our main variables of interest are

-
    -
  • AgeGroup: 2w, 8w, 1yr
  • -
  • ADGKG: 0.05-1.56 kg gained per day (average daily gain kg)
  • -
-
-
-

Categorical variables

-

Now that we know which tests can be used, let’s run them.

-

Normally distributed metrics

-

Since it’s the closest to normalcy, we will use Shannon’s diversity as an example. First, we will test age, which is a categorical variable with more than 2 levels. Thus, we run ANOVA. If age were only two levels, we could run a t-test

-

Does age impact the Shannon diversity of the fecal microbiota?

-
#Run the ANOVA and save it as an object
-aov.shannon.age = aov(shannon ~ AgeGroup, data=meta)
-#Call for the summary of that ANOVA, which will include P-values
-summary(aov.shannon.age)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   103.4 1.35e-11 ***
-## Residuals   21   4.36   0.208                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

To do all the pairwise comparisons between groups and correct for multiple comparisons, we run Tukey’s honest significant difference (HSD) test on our ANOVA.

-
TukeyHSD(aov.shannon.age)
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup, data = meta)
-## 
-## $AgeGroup
-##             diff        lwr       upr   p adj
-## 2w-1yr -3.270063 -3.8446230 -2.695503 0.0e+00
-## 8w-1yr -1.830903 -2.4054628 -1.256342 2.0e-07
-## 8w-2w   1.439160  0.8646001  2.013720 8.5e-06
-

We clearly see that all age groups have significantly different diversity. When we plot the data, we see that diversity increases as the animals age.

-
#Re-order the groups because the default is 1yr-2w-8w
-meta$AgeGroup.ord = factor(meta$AgeGroup, c("2w","8w","1yr"))
-#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-boxplot(shannon ~ AgeGroup.ord, data=meta, ylab="Shannon's diversity")
-

-

Non-normally distributed metrics

-

We will use Chao’s richness estimate here. Since age is categorical, we use Kruskal-Wallis (non-parametric equivalent of ANOVA). If we have only two levels, we would run Wilcoxon rank sum test (non-parametric equivalent of t-test)

-
kruskal.test(chao ~ AgeGroup, data=meta)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  chao by AgeGroup
-## Kruskal-Wallis chi-squared = 19.28, df = 2, p-value = 6.507e-05
-

We can test pairwise within the age groups with Wilcoxon Rank Sum Tests. This test has a slightly different syntax than our other tests

-
pairwise.wilcox.test(meta$chao, meta$AgeGroup, p.adjust.method="fdr")
-
## 
-##  Pairwise comparisons using Wilcoxon rank sum test 
-## 
-## data:  meta$chao and meta$AgeGroup 
-## 
-##    1yr     2w     
-## 2w 0.00023 -      
-## 8w 0.00023 0.00186
-## 
-## P value adjustment method: fdr
-

Like diversity, we see that richness also increases with age.

-
#Create 1x1 plot environment
-par(mfrow = c(1, 1))
-#Plot
-boxplot(chao ~ AgeGroup.ord, data=meta, ylab="Chao richness")
-

-
-
-

Continuous variables

-

For continuous variables, we use general linear models, specifying the distribution that best fits our data.

-

Normally distributed metrics

-

Since ADG is a continuous variable, we run a general linear model. We will again use Shannon’s diversity as our roughly normal metric. The default of glm and lm is the normal distribution so we don’t have to specify anything.

-

Does ADG impact the Shannon diversity of the fecal microbiota?

-
glm.shannon.ADG = glm(shannon ~ ADGKG, data=meta)
-summary(glm.shannon.ADG)
-
## 
-## Call:
-## glm(formula = shannon ~ ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -2.49110  -1.11216  -0.01749   1.53658   1.84728  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)   
-## (Intercept)  3.62565    1.01390   3.576  0.00169 **
-## ADGKG       -0.03407    0.97805  -0.035  0.97253   
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 2.151815)
-## 
-##     Null deviance: 47.343  on 23  degrees of freedom
-## Residual deviance: 47.340  on 22  degrees of freedom
-## AIC: 90.412
-## 
-## Number of Fisher Scoring iterations: 2
-

The output lets us know that the intercept of our model is significantly different from 0 but our slope (i.e. our variable of interest) is not. This makes sense when we look at the data.

-
plot(shannon ~ ADGKG, data=meta)
-#Add the glm best fit line
-abline(glm.shannon.ADG)
-

-

Non-normally distributed metrics

-

We will again use a general linear model for our non-normally distributed metric Chao. However, this time, we change the distribution from normal to something that fits the data better.

-

But which distribution should we choose? In statistics, there is no one “best” model. There are only good and better models. We will use the plot() function to compare two models and pick the better one.

-

First, the Gaussian (normal) distribution, which we already know is a bad fit.

-
gaussian.chao.ADG = glm(chao ~ ADGKG, data=meta, family="gaussian")
-par(mfrow = c(1,2))
-plot(gaussian.chao.ADG, which=c(1,2))
-

-

Quasipoisson (log) distribution

-
qp.chao.ADG = glm(chao ~ ADGKG, data=meta, family="quasipoisson")
-par(mfrow = c(1,2))
-plot(qp.chao.ADG, which=c(1,2))
-

-

What we’re looking for is no pattern in the Residuals vs. Fitted graph (“stars in the sky”), which shows that we picked a good distribution family to fit our data. We also want our residuals to be normally distributed, which is shown by most/all of the points falling on the line in the Normal Q-Q plot.

-

While it’s still not perfect, the quasipoisson fits much better with residuals on the order of 30 whereas gaussian was on the order of 600. So, we will use quasipoisson and see that ADG does not correlate with Chao richness.

-
summary(qp.chao.ADG)
-
## 
-## Call:
-## glm(formula = chao ~ ADGKG, family = "quasipoisson", data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -24.36  -17.05  -10.66   18.81   26.91  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   6.4528     0.5561  11.605 7.54e-11 ***
-## ADGKG        -0.1859     0.5438  -0.342    0.736    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 374.2485)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance: 8074.4  on 22  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 5
-

Plotting this we see that, indeed, there is no significant correlation between Chao and ADG.

-
#Return the plot area to 1x1
-par(mfrow = c(1, 1))
-#Plot
-plot(log(chao) ~ ADGKG, data=meta, ylab="ln(Chao's richness)")
-abline(qp.chao.ADG)
-

-
-
-

Mixed models

-

Our two variables may not be fully independent and therefore, running them in two separate tests may not be correct. That is to say, age may impact ADG. In fact, I know this is the case because calves (2w, 8w) gain weight more quickly than heifers (1yr).

-

Think about your variables and what they mean “in the real world.” Logically combine them into as few ANOVA tests as possible. In the end, it’s better to test a meaningless interaction (as it will most likely not be significant) than not test a meaningful one.

-

We can test if the interaction of age and ADG impacts diversity with a model that includes both of our variables. The * symbol is a shortcut for models. A*B is equivalent to A + B + A:B

-
aov.shannon.all = aov(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(aov.shannon.all)
-
##                Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup        2  42.98  21.489  95.472 2.61e-10 ***
-## ADGKG           1   0.05   0.054   0.239    0.631    
-## AgeGroup:ADGKG  2   0.26   0.130   0.576    0.572    
-## Residuals      18   4.05   0.225                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We can see that the interaction of age and ADG doesn’t significantly impact Shannon diversity, so we should remove that term to simplify our model. If you had many interaction terms, you would step-wise remove the one with the highest P-value until you had the simplest model with only individual variables and significant interaction terms.

-
aov.shannon.all2 = aov(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(aov.shannon.all2)
-
##             Df Sum Sq Mean Sq F value   Pr(>F)    
-## AgeGroup     2  42.98  21.489   99.70 3.96e-11 ***
-## ADGKG        1   0.05   0.054    0.25    0.623    
-## Residuals   20   4.31   0.216                     
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Overall, the ANOVA test tells us that only age impacts Shannon diversity but it does not tell us which age groups differ from one another. If all of our variables were categorical, we could run TukeyHSD like we did with age only.

-
TukeyHSD(aov.shannon.all)
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## ADGKG
-
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
-## AgeGroup, ADGKG
-
## Warning in TukeyHSD.aov(aov.shannon.all): 'which' specified some non-
-## factors which will be dropped
-
##   Tukey multiple comparisons of means
-##     95% family-wise confidence level
-## 
-## Fit: aov(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## $AgeGroup
-##             diff       lwr       upr    p adj
-## 2w-1yr -3.270063 -3.875469 -2.664657 0.00e+00
-## 8w-1yr -1.830903 -2.436309 -1.225496 1.20e-06
-## 8w-2w   1.439160  0.833754  2.044567 2.81e-05
-

However, you will see that we don’t get any data from ADG since it is continuous. There is a warning denoting this as “non-factors ignored: ADGKG”.

-

So, we should have run our test as a glm since we have at least one continuous variable. First, we will still include the interaction variable to see that type of output.

-
glm.shannon.all = glm(shannon ~ AgeGroup*ADGKG, data=meta)
-summary(glm.shannon.all)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup * ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##     Min       1Q   Median       3Q      Max  
-## -1.0301  -0.2468   0.0894   0.1572   0.7624  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)        5.7123     2.5928   2.203   0.0409 *
-## AgeGroup2w        -3.3969     2.6197  -1.297   0.2111  
-## AgeGroup8w        -2.9610     2.7554  -1.075   0.2967  
-## ADGKG             -0.4481     2.7599  -0.162   0.8728  
-## AgeGroup2w:ADGKG   0.1228     2.7848   0.044   0.9653  
-## AgeGroup8w:ADGKG   1.0750     2.8763   0.374   0.7130  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.22508)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.0514  on 18  degrees of freedom
-## AIC: 39.413
-## 
-## Number of Fisher Scoring iterations: 2
-

Now this output is saying the same thing as ANOVA but in a more complicated way. The function automatically picks a reference group for categorical variables (in this case, 1yr) to compare all other groups to. Let’s go through each line

-
    -
  • (Intercept) - This is whether or not the y-intercept is 0. A significant P-value indicates that the intercept is not 0, and we wouldn’t expect it to be for any alpha-diversity metric since 0 means nothing is there

  • -
  • AgeGroup2w - the difference between Shannon when Age = 2w vs. 1yr (the same as testing “shannon ~ AgeGroup” and only looking at the 2w-1yr pairwise comparison)
  • -
  • AgeGroup8w - the same as 2w but now looking at only the 8w-1yr comparison

  • -
  • ADGKG - the slope of shannon to ADGKG (the same as testing “shannon ~ ADGKG”)

  • -
  • AgeGroup2w:ADGKG - the difference in slope of shannon ~ ADG between ages 2w and 1yr
  • -
  • AgeGroup8w:ADGKG - the difference in slope of shannon ~ ADG between ages 8w and 1yr

  • -
-

As we saw in ANOVA, none of the interaction terms are significant so we remove them.

-
glm.shannon.all2 = glm(shannon ~ AgeGroup+ADGKG, data=meta)
-summary(glm.shannon.all2)
-
## 
-## Call:
-## glm(formula = shannon ~ AgeGroup + ADGKG, data = meta)
-## 
-## Deviance Residuals: 
-##      Min        1Q    Median        3Q       Max  
-## -0.95299  -0.25858   0.07643   0.30409   0.74487  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)   5.4459     0.3487  15.619 1.14e-12 ***
-## AgeGroup2w   -3.2760     0.2324 -14.094 7.55e-12 ***
-## AgeGroup8w   -1.7989     0.2408  -7.471 3.30e-07 ***
-## ADGKG        -0.1639     0.3281  -0.500    0.623    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 0.2155447)
-## 
-##     Null deviance: 47.3425  on 23  degrees of freedom
-## Residual deviance:  4.3109  on 20  degrees of freedom
-## AIC: 36.903
-## 
-## Number of Fisher Scoring iterations: 2
-

Note: The full glm model with the interaction term included did not show age as significant. When we remove the interaction term, age is significant. This is why you should remove non-significant interaction terms, as they can mask the main effects of individual variables.

-

We can run a similar test with non-normal data like Chao.

-
qp.chao.all = glm(chao ~ AgeGroup*ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup * ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.774  -3.430  -0.140   3.692   5.277  
-## 
-## Coefficients:
-##                  Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)       6.99825    0.71122   9.840 1.14e-08 ***
-## AgeGroup2w       -1.61539    0.75272  -2.146   0.0458 *  
-## AgeGroup8w       -2.24498    0.86846  -2.585   0.0187 *  
-## ADGKG             0.01751    0.75699   0.023   0.9818    
-## AgeGroup2w:ADGKG -0.42295    0.80094  -0.528   0.6039    
-## AgeGroup8w:ADGKG  0.86269    0.86550   0.997   0.3321    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 18.86331)
-## 
-##     Null deviance: 8117.2  on 23  degrees of freedom
-## Residual deviance:  348.5  on 18  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-

Remove the non-significant interaction.

-
qp.chao.all2 = glm(chao ~ AgeGroup+ADGKG, data=meta, family="quasipoisson")
-summary(qp.chao.all2)
-
## 
-## Call:
-## glm(formula = chao ~ AgeGroup + ADGKG, family = "quasipoisson", 
-##     data = meta)
-## 
-## Deviance Residuals: 
-##    Min      1Q  Median      3Q     Max  
-## -7.783  -3.452  -1.378   3.744   8.184  
-## 
-## Coefficients:
-##             Estimate Std. Error t value Pr(>|t|)    
-## (Intercept)  7.03944    0.23567  29.870  < 2e-16 ***
-## AgeGroup2w  -1.98090    0.14862 -13.329 2.08e-11 ***
-## AgeGroup8w  -1.24286    0.11926 -10.422 1.57e-09 ***
-## ADGKG       -0.02643    0.24530  -0.108    0.915    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for quasipoisson family taken to be 23.74583)
-## 
-##     Null deviance: 8117.20  on 23  degrees of freedom
-## Residual deviance:  476.31  on 20  degrees of freedom
-## AIC: NA
-## 
-## Number of Fisher Scoring iterations: 4
-

From all of this, we can conclude that the fecal microbiota increases in diversity and richness as dairy cows age. Animal growth as measured by ADG does not correlate with fecal community diversity or richness.

-
-
-
-

Beta-diversity

-

Beta-diversity is between-sample diversity. It is how different every sample is from every other sample. Thus, each sample has more than one value. Some metrics take abundance into account (i.e. diversity: Bray-Curtis, weighted UniFrac) and some only calculate based on presence-absence (i.e. richness: Jaccard, unweighted UniFrac).

-

Beta-diversity appears like the following (completely made-up numbers)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| .       | sample1 | sample2 | sample3 |
| sample1 | 0       | 0.345   | 0.194   |
| sample2 | 0.345   | 0       | 0.987   |
| sample3 | 0.194   | 0.987   | 0       |
-
-

Visualization

-

The best way to visualize beta-diversity, or how different samples are from each other, is by non-metric multidimensional scaling (nMDS). This is similar to principal coordinate analysis or PCA/PCoA if you’ve heard of that, only nMDS is more statistically robust with multiple iterations in the form of the trymax part of the command.

-

Each symbol on an nMDS plot represents the total microbial community of that sample. Symbols closer together have more similar microbiotas while those farther apart have less similar microbiotas.

-
-

OTU-based metrics

-

There are two main types of beta-diversity measures. These OTU-based metrics treat every OTU as a separate entity without taking taxonomy into account. The distance between Prevotella OTU1 and Prevotella OTU2 is equivalent to the distance between Prevotella OTU1 and Bacteroides OTU1.

-
-

Dot plots

-

First, we calculate the nMDS values for a 2-axis k=2 graph using the OTU-based Bray-Curtis metric that takes into account both the presence/absence and abundance of OTUs in your samples (i.e. diversity). This uses the metaMDS function from the package vegan.

-
BC.nmds = metaMDS(OTU.clean, distance="bray", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.06208161 
-## Run 1 stress 0.06210668 
-## ... Procrustes: rmse 0.001636313  max resid 0.005662513 
-## ... Similar to previous best
-## Run 2 stress 0.06208261 
-## ... Procrustes: rmse 0.0008174643  max resid 0.00186259 
-## ... Similar to previous best
-## Run 3 stress 0.06208133 
-## ... New best solution
-## ... Procrustes: rmse 0.000495613  max resid 0.001143981 
-## ... Similar to previous best
-## Run 4 stress 0.06208228 
-## ... Procrustes: rmse 0.0002768028  max resid 0.0006083455 
-## ... Similar to previous best
-## Run 5 stress 0.06208254 
-## ... Procrustes: rmse 0.0003377152  max resid 0.0007457908 
-## ... Similar to previous best
-## Run 6 stress 0.06208233 
-## ... Procrustes: rmse 0.000285801  max resid 0.000626649 
-## ... Similar to previous best
-## Run 7 stress 0.06210685 
-## ... Procrustes: rmse 0.001453303  max resid 0.005539077 
-## ... Similar to previous best
-## Run 8 stress 0.062104 
-## ... Procrustes: rmse 0.001430176  max resid 0.005147467 
-## ... Similar to previous best
-## Run 9 stress 0.06208351 
-## ... Procrustes: rmse 0.0005018534  max resid 0.00111944 
-## ... Similar to previous best
-## Run 10 stress 0.06208269 
-## ... Procrustes: rmse 0.0003614257  max resid 0.0008024269 
-## ... Similar to previous best
-## Run 11 stress 0.06208154 
-## ... Procrustes: rmse 0.0004861021  max resid 0.001120926 
-## ... Similar to previous best
-## Run 12 stress 0.06212707 
-## ... Procrustes: rmse 0.001859292  max resid 0.005339963 
-## ... Similar to previous best
-## Run 13 stress 0.3702005 
-## Run 14 stress 0.06210406 
-## ... Procrustes: rmse 0.001425256  max resid 0.00512563 
-## ... Similar to previous best
-## Run 15 stress 0.06208142 
-## ... Procrustes: rmse 3.189023e-05  max resid 6.612762e-05 
-## ... Similar to previous best
-## Run 16 stress 0.06210429 
-## ... Procrustes: rmse 0.001578454  max resid 0.005195898 
-## ... Similar to previous best
-## Run 17 stress 0.06210796 
-## ... Procrustes: rmse 0.00155285  max resid 0.005626229 
-## ... Similar to previous best
-## Run 18 stress 0.06208191 
-## ... Procrustes: rmse 0.0001981339  max resid 0.0004391198 
-## ... Similar to previous best
-## Run 19 stress 0.06208168 
-## ... Procrustes: rmse 0.0001331311  max resid 0.000291077 
-## ... Similar to previous best
-## Run 20 stress 0.06210592 
-## ... Procrustes: rmse 0.001396183  max resid 0.005412384 
-## ... Similar to previous best
-## *** Solution reached
-

We see that we reached a convergent solution around 20 iterations and our stress is very low (0.06), meaning that 2 axes are sufficient to view the data.

-

Then plot the nMDS with different colors for your different groups of interest. We will use colors for our three ages

-
par(mfrow = c(1, 1))
-#Create a blank plot for the nmds
-plot(BC.nmds, type="n", main="Bray-Curtis")
-#Add the points colored by age
-points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-#Add a legend
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

This will create a plot in the lower right quadrant. If you want to get fancy, type “?plot” in the console to see other ways to modify the plot function.

-

A similar thing can be done for the Jaccard metric, which only takes into account presence/absence (i.e. richness).

-
J.nmds = metaMDS(OTU.clean, distance="jaccard", k=2, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.0620818 
-## Run 1 stress 0.06208178 
-## ... New best solution
-## ... Procrustes: rmse 0.0007016851  max resid 0.001623036 
-## ... Similar to previous best
-## Run 2 stress 0.06210633 
-## ... Procrustes: rmse 0.001409348  max resid 0.005467011 
-## ... Similar to previous best
-## Run 3 stress 0.06210745 
-## ... Procrustes: rmse 0.001470069  max resid 0.00557513 
-## ... Similar to previous best
-## Run 4 stress 0.06208144 
-## ... New best solution
-## ... Procrustes: rmse 0.0001309513  max resid 0.0002717662 
-## ... Similar to previous best
-## Run 5 stress 0.06208156 
-## ... Procrustes: rmse 5.349512e-05  max resid 0.0001195792 
-## ... Similar to previous best
-## Run 6 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 2.027381e-05  max resid 4.710602e-05 
-## ... Similar to previous best
-## Run 7 stress 0.06208345 
-## ... Procrustes: rmse 0.0004560942  max resid 0.001010311 
-## ... Similar to previous best
-## Run 8 stress 0.06210681 
-## ... Procrustes: rmse 0.001448074  max resid 0.005531499 
-## ... Similar to previous best
-## Run 9 stress 0.06208334 
-## ... Procrustes: rmse 0.0004470347  max resid 0.000984174 
-## ... Similar to previous best
-## Run 10 stress 0.06208155 
-## ... Procrustes: rmse 7.705878e-05  max resid 0.0001651192 
-## ... Similar to previous best
-## Run 11 stress 0.06208217 
-## ... Procrustes: rmse 0.0002412108  max resid 0.0005340427 
-## ... Similar to previous best
-## Run 12 stress 0.06210429 
-## ... Procrustes: rmse 0.001420012  max resid 0.005133791 
-## ... Similar to previous best
-## Run 13 stress 0.06208263 
-## ... Procrustes: rmse 0.0002884997  max resid 0.0006395557 
-## ... Similar to previous best
-## Run 14 stress 0.06208166 
-## ... Procrustes: rmse 0.0001135875  max resid 0.0002424163 
-## ... Similar to previous best
-## Run 15 stress 0.06210651 
-## ... Procrustes: rmse 0.001438738  max resid 0.005503184 
-## ... Similar to previous best
-## Run 16 stress 0.06208137 
-## ... New best solution
-## ... Procrustes: rmse 6.516686e-05  max resid 0.0001605969 
-## ... Similar to previous best
-## Run 17 stress 0.06208244 
-## ... Procrustes: rmse 0.0002976643  max resid 0.0007159927 
-## ... Similar to previous best
-## Run 18 stress 0.06208222 
-## ... Procrustes: rmse 0.0002618419  max resid 0.0006358936 
-## ... Similar to previous best
-## Run 19 stress 0.06208197 
-## ... Procrustes: rmse 0.000208525  max resid 0.0005678922 
-## ... Similar to previous best
-## Run 20 stress 0.0620832 
-## ... Procrustes: rmse 0.0004189108  max resid 0.0009707012 
-## ... Similar to previous best
-## *** Solution reached
-
plot(J.nmds, type="n", main="Jaccard")
-points(J.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-3, 1.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

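You see that the values are very different for Jaccard but the pattern of points is very similar to Bray-Curtis. This is because, when called without binary=TRUE, vegan calculates the quantitative (abundance-based) form of Jaccard, which is a monotonic transformation of Bray-Curtis: $J = 2BC/(1+BC)$. Since nMDS only uses the rank order of distances, the two ordinations are essentially identical. To obtain the presence/absence version of Jaccard, add binary=TRUE to the metaMDS call.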

-
-
-

Ellipses

-

You can also plot standard error (se) ellipses for your nmds data instead of showing all of the individual points. Here, we will plot 99% confidence se ellipses for the Bray-Curtis metric using ordiellipse from vegan.

-

Code courtesy of Madison Cox.

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(-5.5, 2.5, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We clearly see in both the dot and ellipse plots that age significantly impacts the overall structure (Bray-Curtis) and composition (Jaccard) of the fecal bacterial microbiota.

-
-
-

3D plots

-

If your stress is high (like over 0.3) for your metaMDS calculation, you probably need to increase to 3 axes k=3. Graphing a 3D plot is much more complicated, and there are a number of packages that could be used. Here, we will use one option from the plotly package to visualize a 3D Bray-Curtis plot.

-
#Calculate the Bray-Curtis nMDS for 3-axis
-BC.nmds.3D = metaMDS(OTU.clean, distance="bray", k=3, trymax=1000)
-
## Square root transformation
-## Wisconsin double standardization
-## Run 0 stress 0.04686346 
-## Run 1 stress 0.04741659 
-## Run 2 stress 0.04673425 
-## ... New best solution
-## ... Procrustes: rmse 0.01073904  max resid 0.0344814 
-## Run 3 stress 0.05061835 
-## Run 4 stress 0.04740131 
-## Run 5 stress 0.04984642 
-## Run 6 stress 0.04747801 
-## Run 7 stress 0.05226505 
-## Run 8 stress 0.05295437 
-## Run 9 stress 0.04741387 
-## Run 10 stress 0.0457586 
-## ... New best solution
-## ... Procrustes: rmse 0.03868237  max resid 0.1296728 
-## Run 11 stress 0.05094992 
-## Run 12 stress 0.04719303 
-## Run 13 stress 0.05012352 
-## Run 14 stress 0.04750204 
-## Run 15 stress 0.0479423 
-## Run 16 stress 0.04579561 
-## ... Procrustes: rmse 0.004692476  max resid 0.01495666 
-## Run 17 stress 0.05069634 
-## Run 18 stress 0.0485804 
-## Run 19 stress 0.05058189 
-## Run 20 stress 0.04859459 
-## Run 21 stress 0.04996713 
-## Run 22 stress 0.04740079 
-## Run 23 stress 0.04747632 
-## Run 24 stress 0.04675455 
-## Run 25 stress 0.04747574 
-## Run 26 stress 0.0486171 
-## Run 27 stress 0.04575823 
-## ... New best solution
-## ... Procrustes: rmse 0.0005374711  max resid 0.0008831403 
-## ... Similar to previous best
-## *** Solution reached
-

Extract x-y-z values for this nmds

-
BCxyz = scores(BC.nmds.3D, display="sites")
-#This is a table that looks like 
-BCxyz
-
##                 NMDS1       NMDS2        NMDS3
-## 5017.1yr.F -4.7973931  0.33029806 -0.211481225
-## 5017.2w.F   3.1867260  0.06208276  1.484970505
-## 5017.8w.F   1.0614871 -2.13025264 -1.218243774
-## 5020.1yr.F -4.7579235  0.24440345 -0.002888360
-## 5020.2w.F   3.4979230 -1.00981047  1.015200903
-## 5020.8w.F   1.5897780 -1.93435391  0.464128291
-## 5026.1yr.F -4.7720517  0.20611823  0.214815994
-## 5026.2w.F   3.3976411  1.10010056 -0.616957559
-## 5026.8w.F   3.1483050  2.07715934  1.478767471
-## 5031.1yr.F -4.8021402  0.44250394  0.202447638
-## 5031.2w.F   3.3537430  0.48376070 -1.490408346
-## 5031.8w.F   0.8577869 -1.64300786  0.250766536
-## 5037.1yr.F -4.8522745  0.48898068 -0.004218580
-## 5037.2w.F   3.6593056  0.26886383 -0.507062657
-## 5037.8w.F   3.1326413 -0.82210579 -0.024946820
-## 5041.1yr.F -4.7724198  0.28335210  0.060469429
-## 5041.2w.F   3.1661815  2.43615798 -1.218459457
-## 5041.8w.F   1.0947996 -2.58325770 -0.236659085
-## 5045.1yr.F -4.7522029  0.16444286  0.004405471
-## 5045.2w.F   1.5110480  3.11956405 -0.469494555
-## 5045.8w.F   1.4900615 -2.17087166 -0.450930039
-## 5053.1yr.F -4.8259682  0.39929033 -0.016428020
-## 5053.2w.F   3.2932453  2.30299477  0.813801957
-## 5053.8w.F   0.8917011 -2.11641360  0.478404284
-

Plot the xyz coordinates and color by age

-
plot_ly(x=BCxyz[,1], y=BCxyz[,2], z=BCxyz[,3], type="scatter3d", mode="markers", color=meta$AgeGroup, colors=c("blue", "green", "red"))
-
- -

Note: Since 3D plots are difficult to interpret in printed journal articles, many authors choose to create two separate 2D plots to show the 3D data like so.

-
par(mfrow=c(1,2))
-#Axis 1 and 2 (x and y)
-plot(BCxyz[,1], BCxyz[,2], main="Bray-Curtis 1:2", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-5.4, 3, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Axis 1 and 3 (x and z)
-plot(BCxyz[,1], BCxyz[,3], main="Bray-Curtis 1:3", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-

-
-
-
-

Phylogenetic-based metrics

-

The most common of this type of beta-diversity metrics is UniFrac. The strength of UniFrac over Bray-Curtis or Jaccard is that it takes into account phylogenetic relationships of the species present in the microbiota. Thus, samples with different OTUs from the same genus will be more similar by UniFrac than those with OTUs from different genera. The weakness is that UniFrac is more sensitive to low abundance OTUs and those that are very phylogenetically distant.

-

Your choice will depend on how much you personally feel phylogenetic relationships vs. sensitivity matter in your data.

-

Just as above, UniFrac can be plotted as an nMDS. You just need to use a different R package, and thus, slightly different commands.

-
-

Create physeq object

-

To start, you must make a phyloseq object which includes the OTU.clean, meta, and tax.clean data. We tell R which tables are each type

-
OTU.UF = otu_table(as.matrix(OTU.clean), taxa_are_rows=FALSE)
-tax.UF = tax_table(as.matrix(tax.clean))
-meta.UF = sample_data(meta)
-

We then merge these into an object of class phyloseq.

-
physeq = phyloseq(OTU.UF, tax.UF, meta.UF)
-

To add the phylogenetic component to UniFrac, we calculate a rooted phylogenetic tree of our OTUs. This takes a long time so we have provided the tree for you.

-

However, if we were to calculate a tree, first, we import a distance matrix created from representative sequences of our OTUs. We would use phyloseq to read the file as it was created in mothur as seen under “Trees of OTUs” here.

-

DO NOT RUN THIS

-
dist.mat = import_mothur_dist("clean_repFasta.phylip.dist")
-

We would then calculate a rooted neighbor-joining tree from the distance matrix using the ape package.

-

DO NOT RUN THIS

-
NJ.tree = bionj(dist.mat)
-

Instead, we have pre-calculated this tree and you can load it with

-
load("NJ.tree.Rdata")
-

Then, add this tree to your physeq object. This object will be what is used in UniFrac calculations.

-
physeq.tree = merge_phyloseq(physeq, NJ.tree)
-

We can look at this object and see its components.

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
-
-

Dot plots

-

Calculate weighted UniFrac (i.e. diversity) distances and ordinate into an nMDS. We specify weighted with weighted=TRUE.

-
wUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=TRUE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00062 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 0.0864543 
-## Run 1 stress 0.08645377 
-## ... New best solution
-## ... Procrustes: rmse 0.0001213931  max resid 0.0003141587 
-## ... Similar to previous best
-## Run 2 stress 0.1335727 
-## Run 3 stress 0.1463023 
-## Run 4 stress 0.08645329 
-## ... New best solution
-## ... Procrustes: rmse 0.0007206919  max resid 0.001920389 
-## ... Similar to previous best
-## Run 5 stress 0.1270238 
-## Run 6 stress 0.1157455 
-## Run 7 stress 0.1143571 
-## Run 8 stress 0.1317677 
-## Run 9 stress 0.08645345 
-## ... Procrustes: rmse 5.804039e-05  max resid 0.0001620988 
-## ... Similar to previous best
-## Run 10 stress 0.08808605 
-## Run 11 stress 0.08645348 
-## ... Procrustes: rmse 0.000642139  max resid 0.001706552 
-## ... Similar to previous best
-## Run 12 stress 0.1157451 
-## Run 13 stress 0.0864534 
-## ... Procrustes: rmse 4.051435e-05  max resid 0.0001125382 
-## ... Similar to previous best
-## Run 14 stress 0.1143564 
-## Run 15 stress 0.08659435 
-## ... Procrustes: rmse 0.004251655  max resid 0.01804703 
-## Run 16 stress 0.1295296 
-## Run 17 stress 0.0864538 
-## ... Procrustes: rmse 0.000161137  max resid 0.0004585026 
-## ... Similar to previous best
-## Run 18 stress 0.1347981 
-## Run 19 stress 0.08645297 
-## ... New best solution
-## ... Procrustes: rmse 0.0003657154  max resid 0.0008934259 
-## ... Similar to previous best
-## Run 20 stress 0.08808625 
-## *** Solution reached
-

You can plot UniFrac nMDS using the basic plot function as we’ve done before.

-
par(mfrow=c(1,1))
-plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(0.3,0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-

-

But let’s also look at the ggplot2 package. This package is incredibly powerful and can be customized in many ways. This document has many helpful tips.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Weighted UniFrac")
-

-

Unweighted UniFrac (i.e. richness) can be visualized in the same way. We specify unweighted with weighted=FALSE.

-
uwUF.ordu = ordinate(physeq.tree, method="NMDS", distance="unifrac", weighted=FALSE)
-
## Warning in UniFrac(physeq, ...): Randomly assigning root as -- Otu00541 --
-## in the phylogenetic tree in the data you provided.
-
## Run 0 stress 9.695153e-05 
-## Run 1 stress 9.657832e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.750783e-05  max resid 0.0002776914 
-## ... Similar to previous best
-## Run 2 stress 9.871795e-05 
-## ... Procrustes: rmse 8.086551e-05  max resid 0.0002819207 
-## ... Similar to previous best
-## Run 3 stress 9.488623e-05 
-## ... New best solution
-## ... Procrustes: rmse 7.261501e-05  max resid 0.0002642816 
-## ... Similar to previous best
-## Run 4 stress 9.862006e-05 
-## ... Procrustes: rmse 1.701217e-05  max resid 5.025527e-05 
-## ... Similar to previous best
-## Run 5 stress 9.806631e-05 
-## ... Procrustes: rmse 0.0001070473  max resid 0.0002353732 
-## ... Similar to previous best
-## Run 6 stress 9.757454e-05 
-## ... Procrustes: rmse 3.985665e-05  max resid 0.0001388531 
-## ... Similar to previous best
-## Run 7 stress 9.826177e-05 
-## ... Procrustes: rmse 9.722135e-05  max resid 0.0002191936 
-## ... Similar to previous best
-## Run 8 stress 9.695708e-05 
-## ... Procrustes: rmse 7.448687e-05  max resid 0.0002751687 
-## ... Similar to previous best
-## Run 9 stress 9.907648e-05 
-## ... Procrustes: rmse 9.310993e-05  max resid 0.0002388289 
-## ... Similar to previous best
-## Run 10 stress 9.984534e-05 
-## ... Procrustes: rmse 3.384419e-05  max resid 0.0001260377 
-## ... Similar to previous best
-## Run 11 stress 9.684607e-05 
-## ... Procrustes: rmse 0.0001319037  max resid 0.0003356478 
-## ... Similar to previous best
-## Run 12 stress 9.69891e-05 
-## ... Procrustes: rmse 8.404145e-06  max resid 2.447679e-05 
-## ... Similar to previous best
-## Run 13 stress 0.0002969569 
-## ... Procrustes: rmse 0.0003866364  max resid 0.0006715474 
-## ... Similar to previous best
-## Run 14 stress 9.723199e-05 
-## ... Procrustes: rmse 3.731826e-05  max resid 0.0001336343 
-## ... Similar to previous best
-## Run 15 stress 9.99257e-05 
-## ... Procrustes: rmse 0.0001270356  max resid 0.0003614341 
-## ... Similar to previous best
-## Run 16 stress 9.955355e-05 
-## ... Procrustes: rmse 6.056256e-05  max resid 0.0001673759 
-## ... Similar to previous best
-## Run 17 stress 9.589429e-05 
-## ... Procrustes: rmse 1.686683e-05  max resid 4.596185e-05 
-## ... Similar to previous best
-## Run 18 stress 9.633493e-05 
-## ... Procrustes: rmse 3.660483e-05  max resid 0.0001324208 
-## ... Similar to previous best
-## Run 19 stress 9.921893e-05 
-## ... Procrustes: rmse 1.085938e-05  max resid 1.669484e-05 
-## ... Similar to previous best
-## Run 20 stress 9.637055e-05 
-## ... Procrustes: rmse 6.450683e-05  max resid 0.0001970587 
-## ... Similar to previous best
-## *** Solution reached
-
## Warning in metaMDS(ps.dist): Stress is (nearly) zero - you may have
-## insufficient data
-
plot_ordination(physeq.tree, uwUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  ggtitle("Unweighted UniFrac")
-

-
-
-

Ellipses

-

Ellipses can be plotted instead of points as well. With the basic plot function:

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
legend(0.3, 0.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-
-#Add an ellipse for 2w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-
-#Add an ellipse for 8w
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-
-#Add an ellipse for 1yr
-ordiellipse(wUF.ordu, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

We can also plot ellipses in ggplot2. However, these ellipses are not exactly the same as the standard error ellipses used with OTU-based metrics as they use different underlying calculations. However, they get at the same question of confidence intervals for groups of points on an nMDS.

-

We plot ellipses with ggplot2 by adding the stat_ellipse function to our plot.

-
plot_ordination(physeq.tree, wUF.ordu, type="sites", color="AgeGroup") + 
-  scale_colour_manual(values=c("2w"="green", "8w"="red", "1yr"="blue")) + 
-  theme_bw() + 
-  stat_ellipse() + 
-  ggtitle("Weighted UniFrac")
-

-
-
-

3D plots

-

3D UniFrac ordinations are not currently supported by phyloseq. We see that our ordinations only include 2 dimensions.

-
wUF.ordu
-
## 
-## Call:
-## metaMDS(comm = ps.dist) 
-## 
-## global Multidimensional Scaling using monoMDS
-## 
-## Data:     ps.dist 
-## Distance: user supplied 
-## 
-## Dimensions: 2 
-## Stress:     0.08645297 
-## Stress type 1, weak ties
-## Two convergent solutions found after 20 tries
-## Scaling: centring, PC rotation 
-## Species: scores missing
-
uwUF.ordu
-
## 
-## Call:
-## metaMDS(comm = ps.dist) 
-## 
-## global Multidimensional Scaling using monoMDS
-## 
-## Data:     ps.dist 
-## Distance: user supplied 
-## 
-## Dimensions: 2 
-## Stress:     9.488623e-05 
-## Stress type 1, weak ties
-## Two convergent solutions found after 20 tries
-## Scaling: centring, PC rotation 
-## Species: scores missing
-
-
-
-

Vectors for continuous variables

-

While it is easy to visualize categorical groups with coloring in nMDS, it is difficult to achieve the same effect with continuous variables. Instead, we can fit these variables as a vector on our nMDS plots.

-

To do this, we first fit the variables to our distances using the envfit function in vegan. You can do Bray-Curtis, Jaccard, weighted or unweighted UniFrac. Here, we will demonstrate with Bray-Curtis and weighted UniFrac.

-
fit.BC = envfit(BC.nmds, meta) 
-fit.BC
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## AgeExact -0.99887 -0.04744 0.9765  0.001 ***
-## ADGKG     0.12503  0.99215 0.0770  0.436    
-## chao     -0.98567  0.16868 0.9599  0.001 ***
-## shannon  -0.69400  0.71997 0.9469  0.001 ***
-## simpson   0.42087 -0.90712 0.7353  0.001 ***
-## ace      -0.99746  0.07129 0.9078  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##                   NMDS1   NMDS2
-## Animalcow5017   -0.1841  0.5449
-## Animalcow5020    0.0059  0.6577
-## Animalcow5026    0.4243 -0.8826
-## Animalcow5031   -0.2442  0.1175
-## Animalcow5037    0.4946 -0.0566
-## Animalcow5041    0.0500 -0.0290
-## Animalcow5045   -0.1374 -0.3384
-## Animalcow5053   -0.4090 -0.0134
-## AgeGroup1yr     -4.4470 -0.1800
-## AgeGroup2w       2.5047 -1.0509
-## AgeGroup8w       1.9422  1.2309
-## AgeGroup.ord2w   2.5047 -1.0509
-## AgeGroup.ord8w   1.9422  1.2309
-## AgeGroup.ord1yr -4.4470 -0.1800
-## 
-## Goodness of fit:
-##                  r2 Pr(>r)    
-## Animal       0.0248  1.000    
-## AgeGroup     0.9134  0.001 ***
-## AgeGroup.ord 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We see that it has automatically fit every variable in our meta table.

-

The simplest way around this is to just ask envfit to run on only the variables you want.

-
fit.BC = envfit(BC.nmds, meta[,c("AgeGroup", "ADGKG")])
-fit.BC
-
## 
-## ***VECTORS
-## 
-##         NMDS1   NMDS2    r2 Pr(>r)
-## ADGKG 0.12503 0.99215 0.077  0.488
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -4.4470 -0.1800
-## AgeGroup2w   2.5047 -1.0509
-## AgeGroup8w   1.9422  1.2309
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.9134  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

We repeat for weighted UniFrac

-
fit.wUF = envfit(wUF.ordu, meta[,c("AgeGroup", "ADGKG")])
-fit.wUF
-
## 
-## ***VECTORS
-## 
-##          NMDS1    NMDS2     r2 Pr(>r)
-## ADGKG -0.17846  0.98395 0.0398  0.651
-## Permutation: free
-## Number of permutations: 999
-## 
-## ***FACTORS:
-## 
-## Centroids:
-##               NMDS1   NMDS2
-## AgeGroup1yr -0.1076 -0.0834
-## AgeGroup2w   0.1432  0.0322
-## AgeGroup8w  -0.0356  0.0511
-## 
-## Goodness of fit:
-##              r2 Pr(>r)    
-## AgeGroup 0.5588  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-

For categorical variables, envfit will label the centroid of the data for each group in the nMDS with that group’s name. For continuous variables, it adds an arrow in the direction from smallest to largest value.

-

Note: The P-values for variables in envfit are not equivalent to the P-values for our ANOVA/Kruskal/GLM tests. Instead, envfit P-values tell you how well the arrow or centroids fit the x-y data of the nMDS, not the underlying distance matrix. In general, if your nMDS is a good representation of the data (low stress value) and the variable was significant in its appropriate ANOVA/Kruskal/GLM test, the fitted arrow/centroids will also be significant. And if your nMDS is a good representation of the data and the variable was not significant, the fitted arrow/centroids will also not be significant. We see this type of result here, but this will not always be the case.

-

However, if your nMDS stress was borderline or not great and/or your variable was borderline significant or not, you may see divergent results for the arrow/centroid. This does not mean that the result you got in ANOVA/Kruskal/GLM was invalid. It just means that it’s difficult to visualize this result as a simple arrow or centroids on a 2D plot. Regardless, non-significant variables in envfit that you know are significant in other tests may still be represented on an nMDS as a visual aid.

-

Thus, we plot our 2D nMDS colored by age with an arrow for the ADG variable even though that arrow was not significant. Since the ADG variable was also not significant in GLM, we probably won’t use these plots in a publication, but it is good practice.

-

For Bray-Curtis:

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black")
-

-

You could also ask it to only plot variables with a fit P-value < 0.05. So we would only see the centroids

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, 2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC, col="black", p.max=0.05)
-

-

Weighted UniFrac

-
plot(wUF.ordu, type="n", main="Weighted UniFrac")
-
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
-## Species scores not available
-
points(wUF.ordu, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(.3,.15, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.wUF, col="black")
-

-

You could also fit your OTU.clean table to the nMDS to add arrow(s) for specific OTUs within the plot. OTU arrows that, say, go in the same direction as an age group centroid tend to increase in abundance in that age group. The opposite direction would indicate that an OTU decreases in abundance in that age group.

-

Fitting all OTUs would take a while, so we will only fit the first 10 in our table.

-
fit.BC.OTU = envfit(BC.nmds, OTU.clean[,1:10])
-fit.BC.OTU
-
## 
-## ***VECTORS
-## 
-##             NMDS1    NMDS2     r2 Pr(>r)    
-## Otu00001  0.71738 -0.69668 0.2478  0.035 *  
-## Otu00002  0.46984 -0.88275 0.2109  0.083 .  
-## Otu00003  0.25719 -0.96636 0.2503  0.028 *  
-## Otu00004  0.25006  0.96823 0.2738  0.025 *  
-## Otu00005  0.15473  0.98796 0.2910  0.011 *  
-## Otu00006 -0.96867  0.24837 0.6743  0.001 ***
-## Otu00007  0.17991 -0.98368 0.2488  0.011 *  
-## Otu00008  0.40157  0.91583 0.3108  0.022 *  
-## Otu00009  0.26275 -0.96487 0.1894  0.060 .  
-## Otu00010  0.33868 -0.94090 0.1552  0.102    
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#We will only plot significant arrows in this case
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.OTU, col="black", p.max=0.05)
-

-

You could also think about plotting higher taxonomic levels like summed genera or family groups of OTUs.

-
#Extract all OTUs within the genus Ruminococcus
-OTU.Rumino = OTU.clean[,tax.clean$Genus == "g__Ruminococcus"]
-#Sum the abundances of the Ruminococcaceae OTUs into one variable (column)
-OTU.Rumino$Rumino.sum = rowSums(OTU.Rumino)
-
-#Fit the new Ruminococcaceae group
-fit.BC.Rumino = envfit(BC.nmds, OTU.Rumino$Rumino.sum)
-fit.BC.Rumino
-
## 
-## ***VECTORS
-## 
-##         NMDS1    NMDS2     r2 Pr(>r)    
-## [1,] -0.14506  0.98942 0.6621  0.001 ***
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## Permutation: free
-## Number of permutations: 999
-
#Plot
-plot(BC.nmds, type="n", main="Bray-Curtis")
-points(BC.nmds, pch=20, display="sites", col=c("blue", "green", "red")[meta$AgeGroup])
-legend(-6, -1.1, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-#Add fitted variables
-plot(fit.BC.Rumino, col="black", labels=c("Ruminococcus"))
-

-
-
-
-

Statistically test beta-diversity

-

While nMDS gives us a visual of beta-diversity, it does not test for statistical differences. We do this with permutational analysis of variance (PERMANOVA) or analysis of similarity (ANOSIM). These test whether the overall microbial community differs by your variable of interest.

-

You can run them with Bray-Curtis, Jaccard, weighted or unweighted UniFrac to answer different questions. For example, if your variable is significant for Bray-Curtis/weighted UniFrac but not Jaccard/unweighted UniFrac, this means your groups tend to have the same OTUs (richness) but different abundances of those OTUs (diversity). When variables are significant for Bray-Curtis/Jaccard but not UniFrac, this indicates that your samples have different specific OTUs but similar taxa. Like group 1 has a lot of Prevotella OTU1 and group 2 has a lot of Prevotella OTU2, but they are both Prevotella so UniFrac treats them as being very similar.

-
-

PERMANOVA

-

For Bray-Curtis or Jaccard, we use the vegan package to calculate distances and run PERMANOVA. As with ANOVA/glm of alpha-diversity, we want to include all variables that could interact in one model.

-

Note: adonis cannot handle or account for NA or blanks in your data. Subset to only samples with complete metadata before running vegdist if these exist.

-
#Calculate distance and save as a matrix
-BC.dist=vegdist(OTU.clean, distance="bray")
-#Run PERMANOVA on distances.
-adonis(BC.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000, method = "bray")
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000,      method = "bray") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.646354    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.922078    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

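Similarly for Jaccard. Note: vegdist selects the dissimilarity index through its method argument (default "bray"); distance="jaccard" as written below is not a recognized argument, so J.dist is actually a Bray-Curtis matrix, which is why the sums of squares and R2 values below are identical to the Bray-Curtis results. Use vegdist(OTU.clean, method="jaccard") to obtain Jaccard distances.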

-
J.dist=vegdist(OTU.clean, distance="jaccard")
-adonis(J.dist ~ AgeGroup*ADGKG, data = meta, permutations = 1000, method = "jaccard")
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000,      method = "jaccard") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.9720 1.98600  8.0116 0.44481 0.000999 ***
-## ADGKG           1    0.1979 0.19791  0.7984 0.02216 0.590410    
-## AgeGroup:ADGKG  2    0.2976 0.14881  0.6003 0.03333 0.924076    
-## Residuals      18    4.4620 0.24789         0.49969             
-## Total          23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

We see that the interaction is not significant so we remove it.

-
adonis(BC.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000, method = "bray")
-
## 
-## Call:
-## adonis(formula = BC.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000,      method = "bray") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.599401    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(J.dist ~ AgeGroup+ADGKG, data = meta, permutations = 1000, method = "jaccard")
-
## 
-## Call:
-## adonis(formula = J.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000,      method = "jaccard") 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.9720 1.98600  8.3451 0.44481 0.000999 ***
-## ADGKG      1    0.1979 0.19791  0.8316 0.02216 0.559441    
-## Residuals 20    4.7597 0.23798         0.53302             
-## Total     23    8.9296                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

For UniFrac, we use the phyloseq package to calculate distances and then vegan to run PERMANOVA.

-
wUF.dist = UniFrac(physeq.tree, weighted=TRUE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = TRUE, normalized = TRUE):
-## Randomly assigning root as -- Otu00949 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(wUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2   1.03525 0.51763  5.8901 0.36735 0.000999 ***
-## ADGKG           1   0.09908 0.09908  1.1275 0.03516 0.321678    
-## AgeGroup:ADGKG  2   0.10195 0.05098  0.5801 0.03618 0.873127    
-## Residuals      18   1.58185 0.08788         0.56131             
-## Total          23   2.81814                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
uwUF.dist = UniFrac(physeq.tree, weighted=FALSE, normalized=TRUE)
-
## Warning in UniFrac(physeq.tree, weighted = FALSE, normalized = TRUE):
-## Randomly assigning root as -- Otu04503 -- in the phylogenetic tree in the
-## data you provided.
-
adonis(uwUF.dist ~ AgeGroup*ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup * ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##                Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup        2    3.4929 1.74647  9.1892 0.47107 0.000999 ***
-## ADGKG           1    0.2360 0.23596  1.2415 0.03182 0.230769    
-## AgeGroup:ADGKG  2    0.2650 0.13248  0.6971 0.03573 0.813187    
-## Residuals      18    3.4210 0.19006         0.46137             
-## Total          23    7.4149                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-

Remove non-significant interaction term

-
adonis(wUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = wUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2   1.03525 0.51763  6.1483 0.36735 0.000999 ***
-## ADGKG      1   0.09908 0.09908  1.1769 0.03516 0.306693    
-## Residuals 20   1.68380 0.08419         0.59749             
-## Total     23   2.81814                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
adonis(uwUF.dist ~ AgeGroup+ADGKG, data=meta, permutations = 1000)
-
## 
-## Call:
-## adonis(formula = uwUF.dist ~ AgeGroup + ADGKG, data = meta, permutations = 1000) 
-## 
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Terms added sequentially (first to last)
-## 
-##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)    
-## AgeGroup   2    3.4929 1.74647  9.4762 0.47107 0.000999 ***
-## ADGKG      1    0.2360 0.23596  1.2803 0.03182 0.217782    
-## Residuals 20    3.6860 0.18430         0.49711             
-## Total     23    7.4149                 1.00000             
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
-
-

ANOSIM

-

If you have very different group sizes, you may consider analysis of similarities (ANOSIM) instead of PERMANOVA. This test does not assume equal group variances. However, it only allows simple 1 variable models with no interactions and can only be used for categorical (AgeGroup), not continuous (ADG) variables. So, ANOSIM has a lot of limitations and should only be used if your group sizes are very, very different, like 10 vs 100.

-

For example, Bray-Curtis:

-
anosim(BC.dist, meta$AgeGroup, permutations = 1000)
-
## 
-## Call:
-## anosim(dat = BC.dist, grouping = meta$AgeGroup, permutations = 1000) 
-## Dissimilarity: bray 
-## 
-## ANOSIM statistic R: 0.8467 
-##       Significance: 0.000999 
-## 
-## Permutation: free
-## Number of permutations: 1000
-

Overall, from the nMDS of various beta-diversity metrics (OTU- and phylogenetic-based) and statistical analyses, it is clear that age significantly impacts the fecal microbiota of dairy cows.

-
-
-

2D variables

-

These analyses are for comparing the microbiota to metadata that cannot fit in a single column and therefore, must be represented as a matrix of its own. For example, PERMANOVA can only tell you that the microbiota differs according to a single short chain fatty acid (SCFA), but other tests can tell you that the microbiota differs according to the overall SCFA profile. This section is also useful for comparing data if you have multiple OTU tables, like for bacteria, archaea, and fungi.

-

Mantel from vegan tests if two distance matrices co-vary e.g. does the data in matrix 1 change in the same way as the data in matrix 2. Like PERMANOVA, this test only tells you that the overall data co-vary, not which specific OTUs or SCFAs matter.

-

You can only compare samples where you have both types of data, so we must subset our OTU table to only the samples that we also have SCFA for. The names are a little different between the tables so we also add “.F” to the SCFA names to make them match.

-
OTU.SCFA = OTU.clean[row.names(OTU.clean) %in% paste(row.names(SCFA), ".F", sep=""),]
-

We then calculate distance matrices separately for each matrix. It is not necessary to do Bray-Curtis, Jaccard and UniFrac here since our SCFA data does not have any taxonomy to it.

-
dist1 = vegdist(OTU.SCFA)
-dist2 = vegdist(SCFA)
-

Run a Mantel test comparing the 2 matrices.

-
mantel(dist1, dist2, permutations=100)
-
## 'nperm' >= set of all permutations: complete enumeration.
-
## Set of permutations < 'minperm'. Generating entire set.
-
## 
-## Mantel statistic based on Pearson's product-moment correlation 
-## 
-## Call:
-## mantel(xdis = dist1, ydis = dist2, permutations = 100) 
-## 
-## Mantel statistic r: -0.02423 
-##       Significance: 0.54167 
-## 
-## Upper quantiles of permutations (null model):
-##   90%   95% 97.5%   99% 
-## 0.540 0.552 0.596 0.629 
-## Permutation: free
-## Number of permutations: 23
-

We see that the overall OTU table and SCFA tables do not co-vary.

-

You can also run Mantel on 3 matrices at once like so

-

Do not run as we do not have 3 matrices here

-
mantel.partial(dist1, dist2, dist3, permutations=100)
-
-
-
-

Beta dispersion

-

Sometimes it will be clear from nMDS that one group tends to vary more (be more spread out) than another group. You can test this statistically with multivariate homogeneity of group dispersion (variances).

-

Here is an example for Bray-Curtis. We use the same distance matrix we calculated for PERMANOVA/ANOSIM

-

Calculate dispersion (variances) within each group.

-
disp.age = betadisper(BC.dist, meta$AgeGroup)
-

Perform an ANOVA-like test to determine if the variances differ by groups.

-
permutest(disp.age, pairwise=TRUE, permutations=1000)
-
## 
-## Permutation test for homogeneity of multivariate dispersions
-## Permutation: free
-## Number of permutations: 1000
-## 
-## Response: Distances
-##           Df  Sum Sq  Mean Sq     F N.Perm   Pr(>F)    
-## Groups     2 0.47459 0.237293 30.93   1000 0.000999 ***
-## Residuals 21 0.16111 0.007672                          
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## Pairwise comparisons:
-## (Observed p-value below diagonal, permuted p-value above diagonal)
-##            1yr         2w     8w
-## 1yr            9.9900e-04 0.0010
-## 2w  4.8556e-06            0.7622
-## 8w  1.2886e-06 7.7206e-01
-

Combining this with our plot,

-
plot(BC.nmds, type="n", main="Bray-Curtis")
-legend(.6,-2, legend=c("2w","8w","1yr"), col=c("green","red","blue"), pch=20)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="green", draw="polygon", alpha=200, show.groups = c("2w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="red", draw="polygon", alpha=200, show.groups = c("8w"), border=FALSE)
-ordiellipse(BC.nmds, groups=meta$AgeGroup, display="sites", kind="se", conf=0.99, label=FALSE, col="blue", draw="polygon", alpha=200, show.groups = c("1yr"), border=FALSE)
-

-

we see that 2 week and 8 week calves have similar variability in their fecal microbiotas but that both 2- and 8-week calves have more variable fecal microbiotas than 1-year heifers.

-
-
-
-

OTUs that differ by

-
-

Categorical variables

-

Just because the overall microbiota does or does not differ between age groups, does not mean specific OTUs do or don’t differ by age. However, it is inadvisable to just test all OTUs in your data set against all variables of interest. Since you are running multiple similar tests, you need to apply a false discovery rate (fdr) correction, and correcting across all OTUs (5002 in this data set) will most likely result in no significant results after fdr correction. Also, you don’t want to look at over 5000 P-values, do you?

-

There are a number of ways to decrease the number of OTUs you’re looking at

-
    -
  1. Don’t use OTUs. Add together genus or family groups and test if all or some of these taxa differ across variables of interest
  2. -
  3. Apply an abundance cutoff such as only looking at OTUs/taxa that are at least 1% abundance in at least one sample
  4. -
  5. Apply a frequency cutoff such as only looking at OTUs/taxa that occur in at least 50% of samples
  6. -
  7. Combine 2 and 3
  8. -
-

However, some of these methods are somewhat arbitrary. How do you pick an abundance or frequency cutoff? What if a low abundant OTU is of interest? And what if you are interested in possible species-level differences (OTUs) so high taxonomic levels aren’t useful?

-

So, one way to non-arbitrarily select OTUs/taxa of interest is similarity percentages (SIMPER). SIMPER identifies the OTUs that most contribute to beta-diversity measures. These OTUs are the most abundant and/or most variable OTUs in the data set. Note: SIMPER outputs all pairwise comparisons (A-B, B-C, A-C, etc.) and thus, only works for categorical variables.

-

SIMPER’s output is a list of OTUs which cumulatively explain 70%+ of the variation between each comparison. The numbers below the OTUs are cumulative, so to get each OTU’s contribution, you must subtract the previous OTU’s value.

-

For example

-
simper(OTU.clean, meta$AgeGroup, permutations=100)
-
## cumulative contributions of most influential species:
-## 
-## $`1yr_2w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00011  Otu00006  Otu00009 
-## 0.0983761 0.1627191 0.2225335 0.2657879 0.2982889 0.3271508 0.3514210 
-##  Otu00014  Otu00022  Otu00018  Otu00012  Otu00016  Otu00004  Otu00021 
-## 0.3660756 0.3793171 0.3924608 0.4048922 0.4171422 0.4283988 0.4385280 
-##  Otu00008  Otu00025  Otu00028  Otu00023  Otu00037  Otu00013  Otu00035 
-## 0.4479076 0.4565849 0.4646081 0.4723795 0.4790690 0.4857141 0.4920793 
-##  Otu00055  Otu00030  Otu00036  Otu00040  Otu00042  Otu00010  Otu00049 
-## 0.4983615 0.5045449 0.5106265 0.5166717 0.5226378 0.5274331 0.5321886 
-##  Otu00046  Otu00033  Otu00031  Otu00081  Otu00051  Otu00064  Otu00056 
-## 0.5368030 0.5413764 0.5458188 0.5500936 0.5543565 0.5582465 0.5620674 
-##  Otu00032  Otu00052  Otu00062  Otu00026  Otu00020  Otu00074  Otu00069 
-## 0.5657989 0.5695078 0.5730822 0.5765920 0.5799406 0.5831741 0.5864067 
-##  Otu00066  Otu00077  Otu00148  Otu00073  Otu00067  Otu00065  Otu00076 
-## 0.5895953 0.5927428 0.5958511 0.5989588 0.6020549 0.6051241 0.6081334 
-##  Otu00075  Otu00091  Otu00048  Otu00097  Otu00068  Otu00050  Otu00084 
-## 0.6111073 0.6140400 0.6169121 0.6196512 0.6223697 0.6250661 0.6277023 
-##  Otu00100  Otu00019  Otu00063  Otu00039  Otu00086  Otu00071  Otu00101 
-## 0.6303356 0.6329664 0.6355752 0.6381709 0.6406744 0.6431362 0.6455850 
-##  Otu00089  Otu00096  Otu00095  Otu00108  Otu00088  Otu00103  Otu00094 
-## 0.6480310 0.6504700 0.6528884 0.6553007 0.6576757 0.6600472 0.6624184 
-##  Otu00098  Otu00116  Otu00090  Otu00105  Otu00104  Otu00099  Otu00059 
-## 0.6647575 0.6670589 0.6693444 0.6716046 0.6738590 0.6760506 0.6781917 
-##  Otu00106  Otu00115  Otu00102  Otu00110  Otu00119  Otu00118  Otu00034 
-## 0.6803196 0.6824245 0.6844633 0.6865021 0.6884972 0.6904775 0.6924261 
-##  Otu00114  Otu00093  Otu00124  Otu00045 
-## 0.6943714 0.6962690 0.6981558 0.7000319 
-## 
-## $`1yr_8w`
-##   Otu00001   Otu00005   Otu00006   Otu00004   Otu00010   Otu00017 
-## 0.03765603 0.07335078 0.10010930 0.12226268 0.14087762 0.15688502 
-##   Otu00008   Otu00009   Otu00015   Otu00018   Otu00016   Otu00014 
-## 0.17205091 0.18718833 0.20107546 0.21456235 0.22713556 0.23964967 
-##   Otu00029   Otu00019   Otu00021   Otu00025   Otu00024   Otu00037 
-## 0.25102468 0.26162658 0.27202671 0.28093293 0.28829315 0.29516652 
-##   Otu00035   Otu00044   Otu00055   Otu00027   Otu00036   Otu00040 
-## 0.30170335 0.30821052 0.31465848 0.32109529 0.32733731 0.33354206 
-##   Otu00042   Otu00020   Otu00013   Otu00041   Otu00003   Otu00043 
-## 0.33966556 0.34564370 0.35158279 0.35717451 0.36261926 0.36799345 
-##   Otu00038   Otu00026   Otu00034   Otu00049   Otu00070   Otu00046 
-## 0.37334038 0.37836130 0.38334135 0.38822230 0.39310161 0.39783775 
-##   Otu00012   Otu00058   Otu00011   Otu00051   Otu00054   Otu00045 
-## 0.40234701 0.40670755 0.41102172 0.41521298 0.41939306 0.42353985 
-##   Otu00047   Otu00064   Otu00056   Otu00052   Otu00048   Otu00002 
-## 0.42764688 0.43163954 0.43556497 0.43937178 0.44313291 0.44683135 
-##   Otu00062   Otu00031   Otu00057   Otu00061   Otu00053   Otu00074 
-## 0.45050368 0.45405112 0.45759807 0.46109474 0.46455875 0.46787762 
-##   Otu00069   Otu00066   Otu00077   Otu00073   Otu00067   Otu00079 
-## 0.47119548 0.47447192 0.47770248 0.48089214 0.48406988 0.48721802 
-##   Otu00083   Otu00078   Otu00076   Otu00075   Otu00091   Otu00121 
-## 0.49033806 0.49342871 0.49651735 0.49956976 0.50257978 0.50549547 
-##   Otu00097   Otu00092   Otu00032   Otu00084   Otu00129   Otu00050 
-## 0.50830678 0.51111612 0.51389884 0.51660098 0.51922111 0.52181856 
-##   Otu00100   Otu00101   Otu00096   Otu00108   Otu00095   Otu00086 
-## 0.52434751 0.52686095 0.52936793 0.53184756 0.53429667 0.53674109 
-##   Otu00089   Otu00088   Otu00103   Otu00094   Otu00098   Otu00116 
-## 0.53918547 0.54162316 0.54405719 0.54649097 0.54889172 0.55125394 
-##   Otu00105   Otu00104   Otu00143   Otu00123   Otu00082   Otu00039 
-## 0.55357747 0.55589135 0.55819397 0.56049152 0.56278380 0.56503978 
-##   Otu00099   Otu00130   Otu00090   Otu00106   Otu00107   Otu00115 
-## 0.56728918 0.56953083 0.57176616 0.57395024 0.57611979 0.57828018 
-##   Otu00087   Otu00153   Otu00102   Otu00110   Otu00119   Otu00118 
-## 0.58042631 0.58252590 0.58461849 0.58671108 0.58875879 0.59079874 
-##   Otu00022   Otu00072   Otu00080   Otu00093   Otu00124   Otu00112 
-## 0.59281824 0.59481609 0.59678509 0.59873275 0.60067308 0.60260107 
-##   Otu00122   Otu00131   Otu00132   Otu00134   Otu00128   Otu00125 
-## 0.60450552 0.60639869 0.60828362 0.61014314 0.61199594 0.61383412 
-##   Otu00133   Otu00159   Otu00139   Otu00127   Otu00114   Otu00137 
-## 0.61566158 0.61747930 0.61928689 0.62106367 0.62282385 0.62455846 
-##   Otu00136   Otu00194   Otu00138   Otu00144   Otu00142   Otu00135 
-## 0.62629042 0.62801571 0.62974033 0.63143945 0.63312281 0.63480281 
-##   Otu00147   Otu00120   Otu00188   Otu00126   Otu00028   Otu00211 
-## 0.63647550 0.63814069 0.63980299 0.64140642 0.64300322 0.64457174 
-##   Otu00154   Otu00146   Otu00173   Otu00156   Otu00158   Otu00157 
-## 0.64612078 0.64764950 0.64917769 0.65068721 0.65217234 0.65364696 
-##   Otu00060   Otu00168   Otu00140   Otu00163   Otu00171   Otu00113 
-## 0.65508066 0.65651008 0.65793253 0.65931862 0.66069801 0.66207484 
-##   Otu00178   Otu00200   Otu00165   Otu00170   Otu00164   Otu00187 
-## 0.66344999 0.66480785 0.66616041 0.66748648 0.66881018 0.67012189 
-##   Otu00151   Otu00213   Otu00149   Otu00183   Otu00192   Otu00167 
-## 0.67141176 0.67269928 0.67397558 0.67525135 0.67652371 0.67778788 
-##   Otu00177   Otu00181   Otu00180   Otu00236   Otu00186   Otu00199 
-## 0.67904574 0.68029263 0.68151160 0.68272731 0.68393783 0.68512983 
-##   Otu00253   Otu00150   Otu00204   Otu00169   Otu00218   Otu00189 
-## 0.68632029 0.68750539 0.68867418 0.68982822 0.69097221 0.69210846 
-##   Otu00182   Otu00184   Otu00226   Otu00270   Otu00172   Otu00225 
-## 0.69323878 0.69436709 0.69548866 0.69660494 0.69770318 0.69878699 
-##   Otu00185   Otu00203 
-## 0.69986670 0.70093653 
-## 
-## $`2w_8w`
-##  Otu00002  Otu00001  Otu00003  Otu00007  Otu00009  Otu00005  Otu00011 
-## 0.1101390 0.1804133 0.2466786 0.2952479 0.3351854 0.3745198 0.4100899 
-##  Otu00004  Otu00010  Otu00017  Otu00008  Otu00012  Otu00015  Otu00022 
-## 0.4397781 0.4641945 0.4818672 0.4987872 0.5154942 0.5307997 0.5454777 
-##  Otu00029  Otu00013  Otu00019  Otu00020  Otu00028  Otu00006  Otu00023 
-## 0.5580145 0.5704325 0.5824230 0.5910912 0.5996473 0.6081657 0.6166261 
-##  Otu00024  Otu00027  Otu00031  Otu00044  Otu00030  Otu00041  Otu00043 
-## 0.6247348 0.6322130 0.6396626 0.6468237 0.6539027 0.6600291 0.6659522 
-##  Otu00038  Otu00032  Otu00026  Otu00070  Otu00033  Otu00034  Otu00047 
-## 0.6718453 0.6776585 0.6834157 0.6887933 0.6940870 0.6992933 0.7044391
-

We see a number of OTUs that may differ between 1 or more age comparisons. However, these are just the OTUs that most contribute to Bray-Curtis measures between our age groups. They are not necessarily significantly different.

-

To test significance, we compare the relative abundance of an OTU across our age groups with Kruskal-Wallis (OTU abundance is never normally distributed, trust me). For example, OTU1 occurs in all SIMPER age comparisons and does, in fact, significantly differ by age.

-
kruskal.test(OTU.clean$Otu00001 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00001 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 15.994, df = 2, p-value = 0.0003364
-

In contrast, OTU17 occurs in SIMPER but does not actually significantly differ by age group

-
kruskal.test(OTU.clean$Otu00017 ~ meta$AgeGroup)
-
## 
-##  Kruskal-Wallis rank sum test
-## 
-## data:  OTU.clean$Otu00017 by meta$AgeGroup
-## Kruskal-Wallis chi-squared = 4.9767, df = 2, p-value = 0.08305
-

Note: These P-values have not been corrected for false discovery rate (fdr) yet.

-

Now, it would be very tedious to individually test every variable of interest in SIMPER and then test every SIMPER OTU in Kruskal-Wallis. So, Andrew Steinberger (Suen lab) has written two scripts to simplify both SIMPER and Kruskal-Wallis of SIMPER OTUs. The latest versions can be found on his GitHub page and we have provided them for this workshop in /Steinberger_scripts

-

Disclaimer: Andrew has provided these scripts out of the goodness of his heart and provides no guarantee that they will work for your exact data set or with new versions of R/RStudio/vegan. You may contact him through GitHub with issues or errors, but it is not his job to troubleshoot for you. He may or may not address your concerns in an updated version of the scripts at a later time.

-

The use of these scripts is as follows (from Steinberger GitHub with some modifications)

-

simper_pretty.R

-

This script is meant to rapidly perform the SIMPER function from the R package vegan for all comparisons of interest in a data set. Inputs are OTU and metadata tables, and the output is a .csv. User can tailor contents of .csv by setting perc_cutoff, low_cutoff, and low_val. This function can also handle taxonomic levels instead of OTU, but currently only select formats are compatible. Requires installation of the R package ‘vegan’.

-

Usage:

-

simper.pretty(x, metrics, c(‘interesting’), perc_cutoff=0.5, low_cutoff = ‘y’, low_val=0.01, ‘output_name’)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • perc_cutoff: % cutoff for output OTUs, as decimal (i.e. write 50% as 0.5), larger % increases number OTUs in output.
  • -
  • low_cutoff: ‘y’ if want to REMOVE OTUs that contribute less than 1%
  • -
  • low_val: set value of low cutoff (0.01), ignored if low_cutoff=‘n’.
  • -
  • output_name: the name that is appended to the output filename “_clean_simper.csv“.
  • -
-

R_krusk.R

-

This script takes the output .csv of simper_pretty.R, and the OTU/metadata/taxonomy tables, and performs the non-parametric Kruskal-Wallis rank-sum test on each OTU in the .csv file. Output is a .csv file containing the same contents of simper.pretty output with the following info: p-value, fdr corrected p-value, OTU taxonomic classification (if applicable), mean rel. abund and std dev of otu/tax_lvl in group 1 of comparison, and mean rel. abund and std dev of otu/tax_lvl in group 2 of comparison. Requires installation of R packages ‘vegan’ and ‘dplyr’.

-

Usage:

-

kruskal.pretty(x, metrics, csv, c(‘interesting’), ‘output_name’, taxonomy)

-

Inputs:

-
    -
  • x: OTU table
  • -
  • metrics: metadata table
  • -
  • csv: output from simper.pretty, must be imported as data.frame. e.g. csv= data.frame(read.csv(“PATH to name_clean_simper.csv”))
  • -
  • interesting: a list of the column headers for the columns of interest in the metrics file, should be same as simper.pretty inputs. e.g. c(‘int1’,‘int2’,‘int3’)
  • -
  • output_name= the name that is appended to the output filename “_krusk_simper.csv“.
  • -
  • taxonomy: The .taxonomy file output from classify.otu command in mothur. This is the UNALTERED tax file, not tax.clean (optional)
  • -
-

First, we load these functions into R.

-
source("Steinberger_scripts/simper_pretty.r")
-source("Steinberger_scripts/R_krusk.r")
-

Then, we apply them to our data. We will ask for all SIMPER OTUs (perc_cutoff = 1, meaning up to cumulative 100%) but cutoff any OTUs that individually contribute less than 1% to SIMPER (low_val=0.01). You may want to consider different cutoffs for your data.

-
simper.pretty(OTU.clean, meta, c('AgeGroup'), perc_cutoff=1, low_cutoff = 'y', low_val=0.01, 'Age')
-
-simper.results = data.frame(read.csv("Age_clean_simper.csv"))
-kruskal.pretty(OTU.clean, meta, simper.results, c('AgeGroup'), 'Age', tax)
-

If we import the Kruskal-Wallis results back into R and select only OTUs that were significantly different after fdr correction (fdr_krusk_p.val)…

-
#Import
-KW.results = data.frame(read.csv("Age_krusk_simper.csv"))
-#Remove non-significant
-KW.results.signif = KW.results[KW.results$fdr_krusk_p.val < 0.05,]
-#Order by OTU#
-KW.results.signif = KW.results.signif[with(KW.results.signif, order(OTU)),]
-head(KW.results.signif)
-
##     X Comparison     SIMPER      OTU  krusk_p.val fdr_krusk_p.val
-## 2   2     1yr_2w 0.06434298 Otu00001 0.0004510953     0.001383359
-## 15 15     1yr_8w 0.03765603 Otu00001 0.0004510953     0.001383359
-## 1   1     1yr_2w 0.09837610 Otu00002 0.0004510953     0.001383359
-## 30 30      2w_8w 0.11013903 Otu00002 0.0208625823     0.029989962
-## 3   3     1yr_2w 0.05981442 Otu00003 0.0003310658     0.001383359
-## 32 32      2w_8w 0.06626526 Otu00003 0.0356919001     0.044373714
-##                                                                                                                   Taxonomy
-## 2          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 15         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 1          k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 30         k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii;
-## 3  k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-## 32 k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens;
-##    Left.mean.abund   Left.stdev Right.mean.abund Right.stdev
-## 2     7.109140e-06 2.010768e-05      0.128370197  0.16351829
-## 15    7.109140e-06 2.010768e-05      0.073292635  0.09803742
-## 1     7.118451e-06 2.013402e-05      0.196185324  0.23796423
-## 30    1.961853e-01 2.379642e-01      0.007205221  0.01601067
-## 3     0.000000e+00 0.000000e+00      0.119333403  0.18000346
-## 32    1.193334e-01 1.800035e-01      0.010598818  0.02126522
-

We see a number of OTUs that significantly differ by age group.

-

Looking at OTU1 as relative abundance

-
#Calculate abundance
-abund = OTU.clean/rowSums(OTU.clean)*100
-#plot
-boxplot(abund$Otu00001 ~ meta$AgeGroup.ord, ylab="% Relative abundance", main="OTU1")
-

-

and using the P-values in KW.results.signif, we can say that OTU1 is significantly less abundant in 1yr animals compared to either 2w or 8w calves.

-
-
-

Continuous variables

-

For continuous variables, there is no simple test like SIMPER to pull out OTUs likely to differ across your variable. You could run linear models glm of the OTU abundances with different distributions family= similar to what we did with Chao richness. However, OTU abundance data is not normal nor does it fit well with other standard distributions due to its many zeros. So, you will need to test a number of distributions and transformations of the data to find a suitable model.

-
-
-

Correlations

-

So, you can also approach continuous variables as correlations. Generally, only strong correlations (r > 0.5 or r < -0.5) should be reported and if you have a lot that fall into the “strong” category, you can up the cut off, say, to r > 0.75 or r < -0.75. There are many correlation options. I like Kendall-Tau because it does not assume linearity or normality. Type ??cor in the R console to learn others that are available.

-

Also, consider options to decrease the number of OTUs tested or you will be dealing with a huge table. Like only ones at >X% abundance? Only ones found in SIMPER and/or KW analyses of other important variables?

-

Here, we will correlate ADG to OTUs with at least 5% relative abundance in at least one sample in our data set.

-
#Remember we calculated abundance before with
-#abund = OTU.clean/rowSums(OTU.clean)*100
-
-#Subset OTUs to abundance cutoff
-OTU.abund = OTU.clean[, apply(abund, MARGIN=2, function(x) any(x > 5))]
-
-cor.kendall = cor(OTU.abund, meta$ADGKG, method = "kendall")
-cor.kendall
-
##                  [,1]
-## Otu00001  0.189852125
-## Otu00002  0.211764129
-## Otu00003  0.027397313
-## Otu00004  0.275867615
-## Otu00005  0.165056323
-## Otu00006 -0.114462240
-## Otu00007  0.143930930
-## Otu00008  0.211764129
-## Otu00009 -0.177517901
-## Otu00010  0.176299258
-## Otu00011  0.208334326
-## Otu00012  0.017236256
-## Otu00013  0.269669049
-## Otu00015  0.018077538
-## Otu00016 -0.257293680
-## Otu00017  0.284293111
-## Otu00019  0.172479145
-## Otu00020  0.102188122
-## Otu00022 -0.034040152
-## Otu00023  0.004106646
-## Otu00024  0.073416202
-## Otu00027  0.412640807
-## Otu00029  0.076924424
-## Otu00030 -0.077670805
-## Otu00031  0.286002668
-## Otu00038 -0.271163072
-## Otu00041  0.125193349
-## Otu00043  0.189645652
-## Otu00044  0.239065695
-## Otu00053 -0.217652255
-## Otu00055 -0.112428004
-## Otu00070 -0.037317590
-

In this case, we don’t see any strong correlations. However, if we did, we could use those OTUs as our list of ones that are of interest to check for significance with glm.

-

Next, we will correlate SCFAs with OTUs with at least 1% relative abundance in at least one sample in our data set. We will use only samples for which we also have SCFA data.

-
#Calculate abundances
-abund.SCFA = OTU.SCFA/rowSums(OTU.SCFA)*100
-
-#Subset OTUs to abundance cutoff
-OTU.SCFA.abund = OTU.SCFA[, apply(abund.SCFA, MARGIN=2, function(x) any(x > 1))]
-
-cor.kendall = cor(OTU.SCFA.abund, SCFA, method = "kendall")
-cor.kendall
-
##             Formate    Acetate Propionate Isobutyrate   Butyrate
-## Otu00006  0.0000000  0.1825742  0.1825742   0.1825742  0.1825742
-## Otu00014  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00016 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00018 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00021 -0.9128709 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00025  0.9128709  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00035 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00036 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00037 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00040 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00042  0.1825742  0.3333333  0.3333333   0.0000000  0.3333333
-## Otu00046 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00049 -0.1825742 -0.3333333 -0.3333333   0.0000000 -0.3333333
-## Otu00051  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00052 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00056 -0.1825742 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00064 -0.5477226 -0.3333333 -0.3333333  -0.6666667 -0.3333333
-## Otu00066 -0.5477226 -0.6666667 -0.6666667  -1.0000000 -0.6666667
-## Otu00067  0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00069  0.5477226  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00074  0.5477226  0.6666667  0.6666667   0.3333333  0.6666667
-## Otu00077  0.1825742  0.3333333  0.3333333   0.6666667  0.3333333
-## Otu00088  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00089  0.1825742  0.0000000  0.0000000  -0.3333333  0.0000000
-## Otu00097 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00100 -0.1825742  0.0000000  0.0000000   0.3333333  0.0000000
-## Otu00113 -0.5477226 -0.6666667 -0.6666667  -0.3333333 -0.6666667
-## Otu00192  0.5477226  0.6666667  0.6666667   1.0000000  0.6666667
-## Otu00295  0.2581989  0.2357023  0.2357023   0.7071068  0.2357023
-##            iVal.2MB   Valerate
-## Otu00006 -0.1825742  0.1825742
-## Otu00014 -0.3333333  0.0000000
-## Otu00016 -0.3333333 -0.6666667
-## Otu00018 -0.3333333 -0.6666667
-## Otu00021 -0.6666667 -0.3333333
-## Otu00025  0.6666667  0.3333333
-## Otu00035 -0.6666667 -1.0000000
-## Otu00036  0.0000000 -0.3333333
-## Otu00037  0.0000000  0.3333333
-## Otu00040 -0.6666667 -1.0000000
-## Otu00042 -0.3333333  0.0000000
-## Otu00046 -0.3333333 -0.6666667
-## Otu00049  0.3333333  0.0000000
-## Otu00051  1.0000000  0.6666667
-## Otu00052 -0.6666667 -1.0000000
-## Otu00056 -0.3333333 -0.6666667
-## Otu00064 -1.0000000 -0.6666667
-## Otu00066 -0.6666667 -1.0000000
-## Otu00067  0.6666667  0.3333333
-## Otu00069  1.0000000  0.6666667
-## Otu00074  0.0000000  0.3333333
-## Otu00077  0.3333333  0.6666667
-## Otu00088  0.0000000 -0.3333333
-## Otu00089  0.0000000 -0.3333333
-## Otu00097  0.0000000  0.3333333
-## Otu00100  0.0000000  0.3333333
-## Otu00113  0.0000000 -0.3333333
-## Otu00192  0.6666667  1.0000000
-## Otu00295  0.7071068  0.7071068
-

If the data table is too large to view in R, you can write it to a table in your project folder.

-
write.table(cor.kendall, file = "cor_kendall.csv", sep = ",")
-

We see that some OTUs strongly correlate with SCFAs. For example, Otu00021 and Otu00025 with Formate

-
par(mfrow = c(1, 2))
-plot(abund.SCFA$Otu00021 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU21")
-plot(abund.SCFA$Otu00025 ~ SCFA$Formate, xlab="Formate (mM)", ylab="Relative abundance, %", main="OTU25")
-

-

Clearly we don’t have enough data points to make strong conclusions here and the correlations are being driven by one animal with very high formate. However, we could further test the list of OTUs that correlate strongly with SCFAs. We will assume a normal distribution here, but you should assess your models with plot() to make sure they are a good fit.

-
OTU21.Formate = glm(OTU.SCFA$Otu00021 ~ SCFA$Formate)
-summary(OTU21.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00021 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##       1        2        3        4  
-## -56.173   96.253  -46.747    6.668  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    357.75      51.46   6.952   0.0201 *
-## SCFA$Formate  -540.02     201.13  -2.685   0.1152  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 7324.907)
-## 
-##     Null deviance: 67454  on 3  degrees of freedom
-## Residual deviance: 14650  on 2  degrees of freedom
-## AIC: 50.175
-## 
-## Number of Fisher Scoring iterations: 2
-
OTU25.Formate = glm(OTU.SCFA$Otu00025 ~ SCFA$Formate)
-summary(OTU25.Formate)
-
## 
-## Call:
-## glm(formula = OTU.SCFA$Otu00025 ~ SCFA$Formate)
-## 
-## Deviance Residuals: 
-##        1         2         3         4  
-##  127.727  -118.783     6.217   -15.162  
-## 
-## Coefficients:
-##              Estimate Std. Error t value Pr(>|t|)  
-## (Intercept)    219.78      74.49   2.951   0.0982 .
-## SCFA$Formate   721.00     291.12   2.477   0.1316  
-## ---
-## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-## 
-## (Dispersion parameter for gaussian family taken to be 15346.04)
-## 
-##     Null deviance: 124819  on 3  degrees of freedom
-## Residual deviance:  30692  on 2  degrees of freedom
-## AIC: 53.133
-## 
-## Number of Fisher Scoring iterations: 2
-

So, we see that these two OTUs do not significantly differ with Formate concentration even though they had very strong Kendall correlations. This is similar to OTUs occurring in SIMPER that do not hold up to subsequent Kruskal-Wallis testing.

-
-
-
-

Other visualizations

-
-

Bar charts

-

The phyloseq object we created with our OTU, meta, tax, and tree data (physeq.tree) can also be used in a number of other plot functions in the phyloseq / ggplot2 packages.

-

Let’s explore some of the bar chart options. First, we’ll make the classic additive bar chart for phyla in our samples

-
plot_bar(physeq.tree, fill="Phylum")
-

-

We can simplify by grouping our samples by age group

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") 
-

-

And removing the lines between OTUs in the bars

-
plot_bar(physeq.tree, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And only showing the top 5 most abundant phyla

-
#Sort the Phyla by abundance and pick the top 5
-top5P.names = sort(tapply(taxa_sums(physeq.tree), tax_table(physeq.tree)[, "Phylum"], sum), TRUE)[1:5]
-#Cut down the physeq.tree data to only the top 5 Phyla
-top5P = subset_taxa(physeq.tree, Phylum %in% names(top5P.names))
-#Plot
-plot_bar(top5P, x="AgeGroup", fill="Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

There are many more options within ggplot2 to alter this figure. This document has many helpful tips.

-

Another way to simplify these bar plots is to not show all OTUs for one sample in one bar. We can do this with facet_grid

-
plot_bar(top5P, x="AgeGroup", fill="Phylum", facet_grid = ~Phylum) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
-

-

And you can break it down at any taxonomic level and color by any other level.

-
-
-

Trees

-

We can also plot phylogenetic trees and label/modify them by our variables of interest.

-

Let’s look at the genus Prevotella in our data. We want to subset down to just this genus or else our plot would be too cluttered to read.

-

Subset by genus

-
prevotella = subset_taxa(physeq.tree, Genus == "g__Prevotella")
-

We can see that this worked by comparing the number of taxa in our subset and our original data

-
physeq.tree
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 5002 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 5002 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 5002 tips and 5000 internal nodes ]
-
prevotella
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 106 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 106 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 106 tips and 105 internal nodes ]
-

We can plot these OTUs on a tree.

-
plot_tree(prevotella, plot.margin = 0.5, ladderize = TRUE)
-

-

In the figure, each OTU is represented by the end branch of the tree. How many samples that OTU occurs in is represented by the black dots.

-

Let’s make this figure a little more useful and add 1) Colors to the dots for our age groups, 2) Size to the dots to show OTU abundance, and 3) Species level labels for the OTUs

-
plot_tree(prevotella, color = "AgeGroup", label.tips = "Species", size = "abundance", plot.margin = 0.5, ladderize = TRUE)
-

-

Already it’s a little difficult to read. You can view a larger page by clicking “Zoom” above the figure. Or export the figure as a PDF and save as a full page size, 8.5x11.

-

There are even more customizable options in this figure. Type ?plot_tree into the console to see the help page explaining all the options.

-
-
-

Heat maps

-

There are some good options in both phyloseq and gplots to make heatmaps. We will go through phyloseq but know that the same things could be done in gplots with code specific to that package.

-
-

OTU abundance

-

We’re going to just look at the 20 most abundant OTUs to make it more readable.

-
#Sort the OTUs by abundance and pick the top 20
-top20OTU.names = names(sort(taxa_sums(physeq.tree), TRUE)[1:20])
-#Cut down the physeq.tree data to only the top 20 OTUs
-top20OTU = prune_taxa(top20OTU.names, physeq.tree)
-

We now see that we only have 20 taxa

-
top20OTU
-
## phyloseq-class experiment-level object
-## otu_table()   OTU Table:         [ 20 taxa and 24 samples ]
-## sample_data() Sample Data:       [ 24 samples by 9 sample variables ]
-## tax_table()   Taxonomy Table:    [ 20 taxa by 7 taxonomic ranks ]
-## phy_tree()    Phylogenetic Tree: [ 20 tips and 19 internal nodes ]
-

First, you can make a heatmap of OTU abundance across all samples

-
plot_heatmap(top20OTU)
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And grouped by our age groups

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can label the OTU taxa

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

And group OTUs within the same Phyla

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

We can also change the colors (white -> purple), including the 0s/NAs (grey).

-
plot_heatmap(top20OTU, sample.label="AgeGroup", sample.order="AgeGroup", taxa.label="Genus", taxa.order="Phylum", low="white", high="purple", na.value="grey")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-

You can also have R automatically group your OTUs and samples by beta-diversity. This may yield the most easily interpreted heatmap but if you have a specific research question that is better addressed by your own ordering (like our age groups above), you should stick with that. We’ll show Bray-Curtis as an example. Other options are

-
    -
  • bray
  • -
  • jaccard
  • -
  • wunifrac
  • -
  • uwunifrac
  • -
-
plot_heatmap(top20OTU, "NMDS", "bray", title="Bray-Curtis")
-
## Warning: Transformation introduced infinite values in discrete y-axis
-

-
-
-

Beta-diversity

-

The other common use for heatmaps is to show distances between samples (i.e. beta-diversity) similar to what is shown in nMDS. We have all of the same metric options as we did for nMDS.

-

We do not want to use the plot_heatmap() function from phyloseq because it requires the input of a physeq object. Instead, we can use our distance matrices as inputs for a gplots command. This command will automatically group samples by similarity (trees)

-
#Bray-Curtis
-heatmap.2(as.matrix(BC.dist))
-

-
#UniFrac
-heatmap.2(as.matrix(wUF.dist))
-

-

You could also change the colors

-
#Rainbow colors
-rc <- rainbow(nrow(as.matrix(BC.dist)), start=0, end=0.9)
-heatmap.2(as.matrix(BC.dist), col=rc)
-

-

As always, for further customization, explore with ?heatmap.2

-
-
-
-

Venn diagrams

-

Venn diagram of three samples: 5017.2w.F, 5017.8w.F, and 5017.1yr.F

-

Create a list of OTUs that occur (count > 0) in each sample.

-
    -
  • We select for the row by name with OTU.clean[“name”,]
  • -
  • We select the columns with a value >0 with OTU.clean[,apply()]
  • -
-
OTU.5017.2w = colnames(OTU.clean["5017.2w.F", apply(OTU.clean["5017.2w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.8w = colnames(OTU.clean["5017.8w.F", apply(OTU.clean["5017.8w.F",], MARGIN=2, function(x) any(x >0))])
-
-OTU.5017.1yr = colnames(OTU.clean["5017.1yr.F",apply(OTU.clean["5017.1yr.F",], MARGIN=2, function(x) any(x >0))])
-

We can then use these lists of OTUs to plot a Venn diagram with venn() from the gplots package

-
venn(list(OTU.5017.2w, OTU.5017.8w, OTU.5017.1yr))
-

-

We can also do this for our age groups by selecting all samples where meta$AgeGroup = 2w, 8w, or 1yr

-
OTU.2w = colnames(OTU.clean[meta$AgeGroup == "2w", apply(OTU.clean[meta$AgeGroup == "2w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.8w = colnames(OTU.clean[meta$AgeGroup == "8w", apply(OTU.clean[meta$AgeGroup == "8w",], MARGIN=2, function(x) any(x >0))])
-
-OTU.1yr = colnames(OTU.clean[meta$AgeGroup == "1yr", apply(OTU.clean[meta$AgeGroup == "1yr",], MARGIN=2, function(x) any(x >0))])
-

And plot

-
venn(list(OTU.2w, OTU.8w, OTU.1yr))
-

-

These are not the prettiest Venns, but they are the quickest way to calculate the values within a Venn.

-

Once you have these, you can use the VennDiagram package for more pretty graphing options. For example, the age groups venns would be

-
draw.triple.venn(area1 = 385+58+71+320, area2 = 801+190+320+71, area3 = 3177+190+58+71, n12 = 320+71, n23 = 190+71, n13 = 58+71, n123 = 71, category = c("2w", "8w", "1yr"), lty = "blank", fill = c("green", "red", "blue"))
-

-
## (polygon[GRID.polygon.1343], polygon[GRID.polygon.1344], polygon[GRID.polygon.1345], polygon[GRID.polygon.1346], polygon[GRID.polygon.1347], polygon[GRID.polygon.1348], text[GRID.text.1349], text[GRID.text.1350], text[GRID.text.1351], text[GRID.text.1352], text[GRID.text.1353], text[GRID.text.1354], text[GRID.text.1355], text[GRID.text.1356], text[GRID.text.1357], text[GRID.text.1358])
-

Or with venneuler, you can scale the circles to be proportional to the total number of OTUs in that group

-
#Create a venneuler object
-age.venn=venneuler(c('A' = 385+58+71+320, 'B' = 801+190+320+71, 'C' = 3177+190+58+71, 'A&B' = 320+71, 'B&C' = 190+71, 'A&C' = 58+71, 'A&B&C' = 71))
-
-#Add group names
-age.venn$labels = c("2w", "8w", "1yr")
-
-#Plot
-plot(age.venn)
-

-
-
-
-

Publication figures

-

Once you have a figure you want to include in a publication, there are a number of ways to export it out of R. You can use the “Export” function within the Plots window, but this often does not result in high enough resolution.

-

Here, we will use postscript to export at a specific resolution, size and font. This function uses

-
    -
  • width, height: in inches
  • -
  • horizontal: TRUE = landscape, FALSE = portrait
  • -
  • colormodel: RGB, CMYK, and others
  • -
  • family: Font to be used within figures
  • -
-

Then we add layout if we have more than one plot within the overall figure.

-
    -
  • matrix: -
      -
    • A list of how many figures there are. For 2, it is c(1,2). For 4, it is c(1,2,3,4)
    • -
    • Then the number of rows, columns the figures should be oriented in
    • -
  • -
  • widths: A list of scalars of how large each figure should be in width.
  • -
  • heights: A list of scalars of how large each figure should be in height.
  • -
-
postscript("Fig1.png", width = 6, height = 3, horizontal = FALSE, colormodel = "rgb", family = "ArialMT")
-
-layout(matrix(c(1,2), 1, 2), widths=c(3,2), heights=c(1,1))
-
-plot(BC.nmds, type="n", main="Bray-Curtis")
-    points(BC.nmds, display="sites", pch=20, col=c("blue", "green", "red")[meta$AgeGroup])
-
-boxplot(shannon ~ AgeGroup.ord, data=meta, main="Diversity", ylab="Shannon's diversity", col=c("green", "red", "blue"))
-
-dev.off()
-
## png 
-##   2
-
- - -
-
- - - -
-
- -
- - - - - - - - diff --git a/Microbiota_analysis_R/cor_kendall.csv b/Microbiota_analysis_R/cor_kendall.csv new file mode 100644 index 0000000..7bdbe25 --- /dev/null +++ b/Microbiota_analysis_R/cor_kendall.csv @@ -0,0 +1,30 @@ +"Formate","Acetate","Propionate","Isobutyrate","Butyrate","iVal.2MB","Valerate" +"Otu00006",0,0.182574185835055,0.182574185835055,0.182574185835055,0.182574185835055,-0.182574185835055,0.182574185835055 +"Otu00014",0.182574185835055,0.333333333333333,0.333333333333333,0,0.333333333333333,-0.333333333333333,0 +"Otu00016",-0.182574185835055,-0.333333333333333,-0.333333333333333,-0.666666666666667,-0.333333333333333,-0.333333333333333,-0.666666666666667 +"Otu00018",-0.182574185835055,-0.333333333333333,-0.333333333333333,-0.666666666666667,-0.333333333333333,-0.333333333333333,-0.666666666666667 +"Otu00021",-0.912870929175277,-0.666666666666667,-0.666666666666667,-0.333333333333333,-0.666666666666667,-0.666666666666667,-0.333333333333333 +"Otu00025",0.912870929175277,0.666666666666667,0.666666666666667,0.333333333333333,0.666666666666667,0.666666666666667,0.333333333333333 +"Otu00035",-0.547722557505166,-0.666666666666667,-0.666666666666667,-1,-0.666666666666667,-0.666666666666667,-1 +"Otu00036",-0.547722557505166,-0.666666666666667,-0.666666666666667,-0.333333333333333,-0.666666666666667,0,-0.333333333333333 +"Otu00037",-0.182574185835055,0,0,0.333333333333333,0,0,0.333333333333333 +"Otu00040",-0.547722557505166,-0.666666666666667,-0.666666666666667,-1,-0.666666666666667,-0.666666666666667,-1 +"Otu00042",0.182574185835055,0.333333333333333,0.333333333333333,0,0.333333333333333,-0.333333333333333,0 +"Otu00046",-0.182574185835055,-0.333333333333333,-0.333333333333333,-0.666666666666667,-0.333333333333333,-0.333333333333333,-0.666666666666667 +"Otu00049",-0.182574185835055,-0.333333333333333,-0.333333333333333,0,-0.333333333333333,0.333333333333333,0 
+"Otu00051",0.547722557505166,0.333333333333333,0.333333333333333,0.666666666666667,0.333333333333333,1,0.666666666666667 +"Otu00052",-0.547722557505166,-0.666666666666667,-0.666666666666667,-1,-0.666666666666667,-0.666666666666667,-1 +"Otu00056",-0.182574185835055,-0.333333333333333,-0.333333333333333,-0.666666666666667,-0.333333333333333,-0.333333333333333,-0.666666666666667 +"Otu00064",-0.547722557505166,-0.333333333333333,-0.333333333333333,-0.666666666666667,-0.333333333333333,-1,-0.666666666666667 +"Otu00066",-0.547722557505166,-0.666666666666667,-0.666666666666667,-1,-0.666666666666667,-0.666666666666667,-1 +"Otu00067",0.182574185835055,0,0,0.333333333333333,0,0.666666666666667,0.333333333333333 +"Otu00069",0.547722557505166,0.333333333333333,0.333333333333333,0.666666666666667,0.333333333333333,1,0.666666666666667 +"Otu00074",0.547722557505166,0.666666666666667,0.666666666666667,0.333333333333333,0.666666666666667,0,0.333333333333333 +"Otu00077",0.182574185835055,0.333333333333333,0.333333333333333,0.666666666666667,0.333333333333333,0.333333333333333,0.666666666666667 +"Otu00088",0.182574185835055,0,0,-0.333333333333333,0,0,-0.333333333333333 +"Otu00089",0.182574185835055,0,0,-0.333333333333333,0,0,-0.333333333333333 +"Otu00097",-0.182574185835055,0,0,0.333333333333333,0,0,0.333333333333333 +"Otu00100",-0.182574185835055,0,0,0.333333333333333,0,0,0.333333333333333 +"Otu00113",-0.547722557505166,-0.666666666666667,-0.666666666666667,-0.333333333333333,-0.666666666666667,0,-0.333333333333333 +"Otu00192",0.547722557505166,0.666666666666667,0.666666666666667,1,0.666666666666667,0.666666666666667,1 +"Otu00295",0.258198889747161,0.235702260395516,0.235702260395516,0.707106781186548,0.235702260395516,0.707106781186548,0.707106781186548 diff --git a/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Document.dcf b/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Document.dcf new file mode 
100644 index 0000000..7e2d439 --- /dev/null +++ b/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Document.dcf @@ -0,0 +1,10 @@ +name: Document +title: +username: +account: rpubs +server: rpubs.com +hostUrl: rpubs.com +appId: https://api.rpubs.com/api/v1/document/343284/1506985df7234429bdc5e5130a2cd978 +bundleId: https://api.rpubs.com/api/v1/document/343284/1506985df7234429bdc5e5130a2cd978 +url: http://rpubs.com/publish/claim/343284/b25e6bbbd97642f2a70c89fca21fe807 +when: 1513750800.58265 diff --git a/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Publish Document.dcf b/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Publish Document.dcf new file mode 100644 index 0000000..2bbd438 --- /dev/null +++ b/Microbiota_analysis_R/rsconnect/documents/Microbiota_Analysis_in_R.Rmd/rpubs.com/rpubs/Publish Document.dcf @@ -0,0 +1,10 @@ +name: Publish Document +title: +username: +account: rpubs +server: rpubs.com +hostUrl: rpubs.com +appId: https://api.rpubs.com/api/v1/document/343284/1506985df7234429bdc5e5130a2cd978 +bundleId: https://api.rpubs.com/api/v1/document/343284/1506985df7234429bdc5e5130a2cd978 +url: http://rpubs.com/dillmcfarlan/343284 +when: 1513751252.05472