From 5ecc8ae0b649a629d3296cdcfc73a281e9073a34 Mon Sep 17 00:00:00 2001 From: nfrerebeau Date: Fri, 10 Jan 2025 17:50:22 +0100 Subject: [PATCH] Update vignette --- vignettes/bibliography.bib | 476 +++++++++++++++++++------------------ vignettes/groups.Rmd | 57 ++++- vignettes/nexus.Rmd | 4 +- 3 files changed, 292 insertions(+), 245 deletions(-) diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib index 28c5400..1e242a0 100644 --- a/vignettes/bibliography.bib +++ b/vignettes/bibliography.bib @@ -10,6 +10,73 @@ @book{aitchison1986 pagetotal = {416} } +@inproceedings{aitchison1997, + title = {The {{One-Hour Course}} in {{Compositional Data Analysis}} or {{Compositional Data Analysis Is Simple}}}, + booktitle = {{{IAMG}}'97}, + author = {Aitchison, J.}, + editor = {Pawlowsky-Glahn, V.}, + date = {1997}, + pages = {3--35}, + publisher = {International Center for Numerical Methods in Engineering (CIMNE)}, + location = {Barcelona}, + eventtitle = {Third Annual Conference of the {{International Association}} for {{Mathematical Geology}}}, + langid = {english} +} + +@article{aitchison2002, + title = {Biplots of Compositional Data}, + author = {Aitchison, John and Greenacre, Michael}, + date = {2002}, + journaltitle = {Journal of the Royal Statistical Society: Series C (Applied Statistics)}, + shortjournal = {J Royal Statistical Soc C}, + volume = {51}, + number = {4}, + pages = {375--392}, + issn = {0035-9254, 1467-9876}, + doi = {10.1111/1467-9876.00275}, + langid = {english} +} + +@article{baxter2008, + title = {On {{Statistical Approaches}} to the {{Study}} of {{Ceramic Artefacts Using Geochemical}} and {{Petrographic Data}}}, + author = {Baxter, M. J. and Beardah, C. C. and Papageorgiou, I. and Cau, M. A. and Day, P. M. 
and Kilikoglou, V.}, + date = {2008}, + journaltitle = {Archaeometry}, + volume = {50}, + number = {1}, + pages = {142--157}, + issn = {0003813X}, + doi = {10.1111/j.1475-4754.2007.00359.x}, + abstract = {The scientific analysis of ceramics often has the aim of identifying groups of similar artefacts. Much published work focuses on analysis of data derived from geochemical or mineralogical techniques. The former is more likely to be subjected to quantitative statistical analysis. This paper examines some approaches to the statistical analysis of data arising from both kinds of techniques, including ‘mixed-mode’ methods where both types of data are incorporated into analysis. The approaches are illustrated using data derived from 88 Late Bronze Age transport jars from Kommos, Crete. Results suggest that the mixed-mode approach can provide additional insight into the data.}, + langid = {english} +} + +@inproceedings{beardah2003, + title = {"{{Mixed-mode}}" Approaches to the Grouping of Ceramic Artefacts Using {{S-Plus}}}, + booktitle = {The {{Digital Heritage}} of {{Archaeology}}.}, + author = {Beardah, C. C. and Baxter, M. J. and Papageorgiou, I. and Cau, M. A.}, + editor = {Doerr, M. 
and Sarris, A.}, + date = {2003}, + pages = {261--266}, + publisher = {{Archive of Monuments and Publications, Hellenic Ministry of Culture}}, + location = {Athens}, + eventtitle = {{{CAA2002}} ({{Heraklion}}, {{Crete}}; {{April}} 2002)}, + langid = {english} +} + +@article{cau2004, + title = {Exploring Automatic Grouping Procedures in Ceramic Petrology}, + author = {Cau, Miguel-Angel and Day, Peter M and Baxter, Michael J and Papageorgiou, Ioulia and Iliopoulos, Ioannis and Montana, Giuseppe}, + date = {2004}, + journaltitle = {Journal of Archaeological Science}, + volume = {31}, + number = {9}, + pages = {1325--1338}, + issn = {03054403}, + doi = {10.1016/j.jas.2004.03.006}, + langid = {english} +} + @article{egozcue2003, title = {Isometric {{Logratio Transformations}} for {{Compositional Data Analysis}}}, author = {Egozcue, J. J. and Pawlowsky-Glahn, V. and Mateu-Figueras, G. and Barceló-Vidal, C.}, @@ -23,6 +90,32 @@ @article{egozcue2003 langid = {english} } +@article{egozcue2023, + title = {Subcompositional Coherence and a Novel Proportionality Index of Parts}, + author = {Egozcue, Juan José and Pawlowsky-Glahn, Vera}, + date = {2023}, + journaltitle = {SORT}, + volume = {47}, + number = {2}, + pages = {229--244}, + doi = {10.57645/20.8080.02.7}, + abstract = {Research in compositional data analysis was motivated by spurious (Pearson) correlation. Spurious results are due to semantic incoherence, but the question of ways to relate parts in a statistically consistent way remains open. To solve this problem we first define a coherent system of functions with respect to a subcomposition and analyze the space of parts. This leads to understanding why measures like covariance and correlation depend on the subcomposition considered, while measures like the distance between parts are independent of the same. 
It allows the definition of a novel index of proportionality between parts.}, + langid = {english} +} + +@article{egozcue2024, + title = {Exploring Geochemical Data Using Compositional Techniques: {{A}} Practical Guide}, + shorttitle = {Exploring Geochemical Data Using Compositional Techniques}, + author = {Egozcue, Juan José and Gozzi, Caterina and Buccianti, Antonella and Pawlowsky-Glahn, Vera}, + date = {2024-03}, + journaltitle = {Journal of Geochemical Exploration}, + volume = {258}, + pages = {107385}, + issn = {03756742}, + doi = {10.1016/j.gexplo.2024.107385}, + langid = {english} +} + @article{filzmoser2005, title = {Multivariate Outlier Detection in Exploration Geochemistry}, author = {Filzmoser, Peter and Garrett, Robert G. and Reimann, Clemens}, @@ -50,6 +143,51 @@ @article{filzmoser2008 langid = {english} } +@article{filzmoser2009, + title = {Principal Component Analysis for Compositional Data with Outliers}, + author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, + date = {2009}, + journaltitle = {Environmetrics}, + volume = {20}, + number = {6}, + pages = {621--632}, + issn = {11804009, 1099095X}, + doi = {10.1002/env.966}, + abstract = {Compositional data (almost all data in geochemistry) are closed data, that is they usually sum up to a constant (e.g. weight percent, wt.\%) and carry only relative information. Thus, the covariance structure of compositional data is strongly biased and results of many multivariate techniques become doubtful without a proper transformation of the data. The centred logratio transformation (clr) is often used to open closed data. However the transformed data do not have full rank following a logratio transformation and cannot be used for robust multivariate techniques like principal component analysis (PCA). Here we propose to use the isometric logratio transformation (ilr) instead. 
However, the ilr transformation has the disadvantage that the resulting new variables are no longer directly interpretable in terms of the originally entered variables. Here we propose a technique how the resulting scores and loadings of a robust PCA on ilr transformed data can be back-transformed and interpreted. The procedure is demonstrated using a real data set from regional geochemistry and compared to results from non-transformed and non-robust versions of PCA. It turns out that the procedure using ilr-transformed data and robust PCA delivers superior results to all other approaches. The examples demonstrate that due to the compositional nature of geochemical data PCA should not be carried out without an appropriate transformation. Furthermore a robust approach is preferable if the dataset contains outliers.}, + langid = {english} +} + +@article{filzmoser2009a, + title = {Univariate Statistical Analysis of Environmental (Compositional) Data: {{Problems}} and Possibilities}, + shorttitle = {Univariate Statistical Analysis of Environmental (Compositional) Data}, + author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, + date = {2009}, + journaltitle = {Science of The Total Environment}, + volume = {407}, + number = {23}, + pages = {6100--6108}, + issn = {00489697}, + doi = {10.1016/j.scitotenv.2009.08.008}, + abstract = {For almost 30~years it has been known that compositional (closed) data have special geometrical properties. In environmental sciences, where the concentration of chemical elements in different sample materials is investigated, almost all datasets are compositional. In general, compositional data are parts of a whole which only give relative information. Data that sum up to a constant, e.g. 100~wt.\%, 1,000,000~mg/kg are the best known example. It is widely neglected that the “closure” characteristic remains even if only one of all possible elements is measured, it is an inherent property of compositional data. 
No variable is free to vary independent of all the others. Existing transformations to “open” closed data are seldom applied. They are more complicated than a log transformation and the relationship to the original data unit is lost. Results obtained when using classical statistical techniques for data analysis appeared reasonable and the possible consequences of working with closed data were rarely questioned. Here the simple univariate case of data analysis is investigated. It can be demonstrated that data closure must be overcome prior to calculating even simple statistical measures like mean or standard deviation or plotting graphs of the data distribution, e.g. a histogram. Some measures like the standard deviation (or the variance) make no statistical sense with closed data and all statistical tests building on the standard deviation (or variance) will thus provide erroneous results if used with the original data.}, + langid = {english} +} + +@article{filzmoser2010, + title = {The Bivariate Statistical Analysis of Environmental (Compositional) Data}, + author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, + date = {2010-09-01}, + journaltitle = {Science of The Total Environment}, + volume = {408}, + number = {19}, + pages = {4230--4238}, + issn = {00489697}, + doi = {10.1016/j.scitotenv.2010.05.011}, + url = {http://linkinghub.elsevier.com/retrieve/pii/S0048969710004845}, + urldate = {2017-08-27}, + abstract = {Environmental sciences usually deal with compositional (closed) data. Whenever the concentration of chemical elements is measured, the data will be closed, i.e. the relevant information is contained in the ratios between the variables rather than in the data values reported for the variables. Data closure has severe consequences for statistical data analysis. 
Most classical statistical methods are based on the usual Euclidean geometry — compositional data, however, do not plot into Euclidean space because they have their own geometry which is not linear but curved in the Euclidean sense. This has severe consequences for bivariate statistical analysis: correlation coefficients computed in the traditional way are likely to be misleading, and the information contained in scatterplots must be used and interpreted differently from sets of non-compositional data. As a solution, the ilr transformation applied to a variable pair can be used to display the relationship and to compute a measure of stability. This paper discusses how this measure is related to the usual correlation coefficient and how it can be used and interpreted. Moreover, recommendations are provided for how the scatterplot can still be used, and which alternatives exist for displaying the relationship between two variables.}, + langid = {english} +} + @article{filzmoser2012, title = {Interpretation of Multivariate Outliers for Compositional Data}, author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, @@ -89,19 +227,6 @@ @article{fiserova2011 langid = {english} } -@book{vandenboogaart2013, - title = {Analyzing {{Compositional Data}} with {{R}}}, - author = {family=Boogaart, given=K. 
Gerald, prefix=van den, useprefix=true and Tolosana-Delgado, Raimon}, - date = {2013}, - series = {Use {{R}}!}, - publisher = {Springer-Verlag}, - location = {Berlin Heidelberg}, - doi = {10.1007/978-3-642-36809-7}, - isbn = {978-3-642-36808-0}, - langid = {english}, - pagetotal = {285} -} - @book{greenacre2019, title = {Compositional Data Analysis in Practice}, author = {Greenacre, Michael J.}, @@ -113,113 +238,45 @@ @book{greenacre2019 pagetotal = {121} } -@article{hron2017, - title = {Weighted {{Pivot Coordinates}} for {{Compositional Data}} and {{Their Application}} to {{Geochemical Mapping}}}, - author = {Hron, Karel and Filzmoser, Peter and family=Caritat, given=Patrice, prefix=de, useprefix=true and Fišerová, Eva and Gardlo, Alžběta}, - date = {2017}, - journaltitle = {Mathematical Geosciences}, - shortjournal = {Math Geosci}, - volume = {49}, - number = {6}, - pages = {797--814}, - issn = {1874-8961, 1874-8953}, - doi = {10.1007/s11004-017-9684-z}, - langid = {english} -} - -@article{aitchison2002, - title = {Biplots of Compositional Data}, - author = {Aitchison, John and Greenacre, Michael}, - date = {2002}, - journaltitle = {Journal of the Royal Statistical Society: Series C (Applied Statistics)}, - shortjournal = {J Royal Statistical Soc C}, - volume = {51}, - number = {4}, - pages = {375--392}, - issn = {0035-9254, 1467-9876}, - doi = {10.1111/1467-9876.00275}, - langid = {english} -} - -@article{rousseeuw1990, - title = {Unmasking {{Multivariate Outliers}} and {{Leverage Points}}}, - author = {Rousseeuw, Peter J. 
and family=Zomeren, given=Bert C., prefix=van, useprefix=true}, - date = {1990}, - journaltitle = {Journal of the American Statistical Association}, - shortjournal = {Journal of the American Statistical Association}, - volume = {85}, - number = {411}, - pages = {633--639}, - issn = {0162-1459, 1537-274X}, - doi = {10.1080/01621459.1990.10474920}, - langid = {english} -} - -@incollection{weigand1977, - title = {Turquoise {{Sources}} and {{Source Analysisis}}: {{Mesoamerica}} and the {{Southwestern U}}.{{S}}.{{A}}.}, - shorttitle = {Turquoise {{Sources}} and {{Source Analysisis}}}, - booktitle = {Exchange {{Systems}} in {{Prehistory}}}, - author = {Weigand, P. C. and Harbottle, G. and Sayre, E.}, - editor = {Ericson, J. and Earle, T. K.}, - date = {1977}, - pages = {15--34}, - publisher = {Academic Press}, - location = {New York, NY}, - langid = {english} -} - -@article{filzmoser2009, - title = {Principal Component Analysis for Compositional Data with Outliers}, - author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, - date = {2009}, - journaltitle = {Environmetrics}, - volume = {20}, - number = {6}, - pages = {621--632}, - issn = {11804009, 1099095X}, - doi = {10.1002/env.966}, - abstract = {Compositional data (almost all data in geochemistry) are closed data, that is they usually sum up to a constant (e.g. weight percent, wt.\%) and carry only relative information. Thus, the covariance structure of compositional data is strongly biased and results of many multivariate techniques become doubtful without a proper transformation of the data. The centred logratio transformation (clr) is often used to open closed data. However the transformed data do not have full rank following a logratio transformation and cannot be used for robust multivariate techniques like principal component analysis (PCA). Here we propose to use the isometric logratio transformation (ilr) instead. 
However, the ilr transformation has the disadvantage that the resulting new variables are no longer directly interpretable in terms of the originally entered variables. Here we propose a technique how the resulting scores and loadings of a robust PCA on ilr transformed data can be back-transformed and interpreted. The procedure is demonstrated using a real data set from regional geochemistry and compared to results from non-transformed and non-robust versions of PCA. It turns out that the procedure using ilr-transformed data and robust PCA delivers superior results to all other approaches. The examples demonstrate that due to the compositional nature of geochemical data PCA should not be carried out without an appropriate transformation. Furthermore a robust approach is preferable if the dataset contains outliers.}, - langid = {english} -} - -@article{pawlowsky-glahn2001, - title = {Geometric Approach to Statistical Analysis on the Simplex}, - author = {Pawlowsky-Glahn, V. and Egozcue, J. J.}, - date = {2001}, - journaltitle = {Stochastic Environmental Research and Risk Assessment}, - volume = {15}, - number = {5}, - pages = {384--398}, - issn = {14363240}, - doi = {10.1007/s004770100077}, - abstract = {The geometric interpretation of the expected value and the variance in real Euclidean space is used as a starting point to introduce metric counterparts on an arbitrary finite dimensional Hilbert space. This approach allows us to define general reasonable properties for estimators of parameters, like metric unbiasedness and minimum metric variance, resulting in a useful tool to better understand the logratio approach to the statistical analysis of compositional data, who's natural sample space is the simplex.}, +@article{greenacre2021, + title = {Compositional {{Data Analysis}}}, + author = {Greenacre, Michael}, + date = {2021}, + journaltitle = {Annual Review of Statistics and Its Application}, + shortjournal = {Annu. Rev. Stat. 
Appl.}, + volume = {8}, + number = {1}, + pages = {271--299}, + issn = {2326-8298, 2326-831X}, + doi = {10.1146/annurev-statistics-042720-124436}, + abstract = {Compositional data are nonnegative data carrying relative, rather than absolute, information—these are often data with a constant-sum constraint on the sample values, for example, proportions or percentages summing to 1\% or 100\%, respectively. Ratios between components of a composition are important since they are unaffected by the particular set of components chosen. Logarithms of ratios (logratios) are the fundamental transformation in the ratio approach to compositional data analysis—all data thus need to be strictly positive, so that zero values present a major problem. Components that group together based on domain knowledge can be amalgamated (i.e., summed) to create new components, and this can alleviate the problem of data zeros. Once compositional data are transformed to logratios, regular univariate and multivariate statistical analysis can be performed, such as dimension reduction and clustering, as well as modeling. 
Alternative methodologies that come close to the ideals of the logratio approach are also considered, especially those that avoid the problem of data zeros, which is particularly acute in large bioinformatic data sets.}, langid = {english} } -@article{santos2020, - title = {Modern Methods for Old Data: {{An}} Overview of Some Robust Methods for Outliers Detection with Applications in Osteology}, - shorttitle = {Modern Methods for Old Data}, - author = {Santos, Frédéric}, - date = {2020}, - journaltitle = {Journal of Archaeological Science: Reports}, - shortjournal = {Journal of Archaeological Science: Reports}, - volume = {32}, - pages = {102423}, - issn = {2352409X}, - doi = {10.1016/j.jasrep.2020.102423}, +@article{greenacre2024, + title = {A Comprehensive Workflow for Compositional Data Analysis in Archaeometry, with Code in {{R}}}, + author = {Greenacre, Michael and Wood, Jonathan R.}, + date = {2024-10}, + journaltitle = {Archaeological and Anthropological Sciences}, + shortjournal = {Archaeol Anthropol Sci}, + volume = {16}, + number = {10}, + pages = {171}, + issn = {1866-9557, 1866-9565}, + doi = {10.1007/s12520-024-02070-w}, langid = {english} } -@article{martin-fernandez2003, - title = {Dealing with {{Zeros}} and {{Missing Values}} in {{Compositional Data Sets Using Nonparametric Imputation}}}, - author = {Martín-Fernández, J. A. and Barceló-Vidal, C. and Pawlowsky-Glahn, V.}, - date = {2003}, - journaltitle = {Mathematical Geology}, - volume = {35}, - number = {3}, - pages = {253--278}, - issn = {08828121}, - doi = {10.1023/A:1023866030544}, +@article{grunsky2024, + title = {{{GeoCoDA}}: {{Recognizing}} and Validating Structural Processes in Geochemical Data. 
{{A}} Workflow on Compositional Data Analysis in Lithogeochemistry}, + shorttitle = {{{GeoCoDA}}}, + author = {Grunsky, Eric and Greenacre, Michael and Kjarsgaard, Bruce}, + date = {2024-06}, + journaltitle = {Applied Computing and Geosciences}, + volume = {22}, + pages = {100149}, + issn = {25901974}, + doi = {10.1016/j.acags.2023.100149}, langid = {english} } @@ -251,140 +308,97 @@ @article{hron2011 langid = {english} } -@article{greenacre2021, - title = {Compositional {{Data Analysis}}}, - author = {Greenacre, Michael}, - date = {2021}, - journaltitle = {Annual Review of Statistics and Its Application}, - shortjournal = {Annu. Rev. Stat. Appl.}, - volume = {8}, - number = {1}, - pages = {271--299}, - issn = {2326-8298, 2326-831X}, - doi = {10.1146/annurev-statistics-042720-124436}, - abstract = {Compositional data are nonnegative data carrying relative, rather than absolute, information—these are often data with a constant-sum constraint on the sample values, for example, proportions or percentages summing to 1\% or 100\%, respectively. Ratios between components of a composition are important since they are unaffected by the particular set of components chosen. Logarithms of ratios (logratios) are the fundamental transformation in the ratio approach to compositional data analysis—all data thus need to be strictly positive, so that zero values present a major problem. Components that group together based on domain knowledge can be amalgamated (i.e., summed) to create new components, and this can alleviate the problem of data zeros. Once compositional data are transformed to logratios, regular univariate and multivariate statistical analysis can be performed, such as dimension reduction and clustering, as well as modeling. 
Alternative methodologies that come close to the ideals of the logratio approach are also considered, especially those that avoid the problem of data zeros, which is particularly acute in large bioinformatic data sets.}, - langid = {english} -} - -@article{baxter2008, - title = {On {{Statistical Approaches}} to the {{Study}} of {{Ceramic Artefacts Using Geochemical}} and {{Petrographic Data}}}, - author = {Baxter, M. J. and Beardah, C. C. and Papageorgiou, I. and Cau, M. A. and Day, P. M. and Kilikoglou, V.}, - date = {2008}, - journaltitle = {Archaeometry}, - volume = {50}, - number = {1}, - pages = {142--157}, - issn = {0003813X}, - doi = {10.1111/j.1475-4754.2007.00359.x}, - abstract = {The scientific analysis of ceramics often has the aim of identifying groups of similar artefacts. Much published work focuses on analysis of data derived from geochemical or mineralogical techniques. The former is more likely to be subjected to quantitative statistical analysis. This paper examines some approaches to the statistical analysis of data arising from both kinds of techniques, including ‘mixed-mode’ methods where both types of data are incorporated into analysis. The approaches are illustrated using data derived from 88 Late Bronze Age transport jars from Kommos, Crete. 
Results suggest that the mixed-mode approach can provide additional insight into the data.}, - langid = {english} -} - -@article{cau2004, - title = {Exploring Automatic Grouping Procedures in Ceramic Petrology}, - author = {Cau, Miguel-Angel and Day, Peter M and Baxter, Michael J and Papageorgiou, Ioulia and Iliopoulos, Ioannis and Montana, Giuseppe}, - date = {2004}, - journaltitle = {Journal of Archaeological Science}, - volume = {31}, - number = {9}, - pages = {1325--1338}, - issn = {03054403}, - doi = {10.1016/j.jas.2004.03.006}, +@article{hron2017, + title = {Weighted {{Pivot Coordinates}} for {{Compositional Data}} and {{Their Application}} to {{Geochemical Mapping}}}, + author = {Hron, Karel and Filzmoser, Peter and family=Caritat, given=Patrice, prefix=de, useprefix=true and Fišerová, Eva and Gardlo, Alžběta}, + date = {2017}, + journaltitle = {Mathematical Geosciences}, + shortjournal = {Math Geosci}, + volume = {49}, + number = {6}, + pages = {797--814}, + issn = {1874-8961, 1874-8953}, + doi = {10.1007/s11004-017-9684-z}, langid = {english} } -@inproceedings{beardah2003, - title = {"{{Mixed-mode}}" Approaches to the Grouping of Ceramic Artefacts Using {{S-Plus}}}, - booktitle = {The {{Digital Heritage}} of {{Archaeology}}.}, - author = {Beardah, C. C. and Baxter, M. J. and Papageorgiou, I. and Cau, M. A.}, - editor = {Doerr, M. and Sarris, A.}, +@article{martin-fernandez2003, + title = {Dealing with {{Zeros}} and {{Missing Values}} in {{Compositional Data Sets Using Nonparametric Imputation}}}, + author = {Martín-Fernández, J. A. and Barceló-Vidal, C. 
and Pawlowsky-Glahn, V.}, date = {2003}, - pages = {261--266}, - publisher = {{Archive of Monuments and Publications, Hellenic Ministry of Culture}}, - location = {Athens}, - eventtitle = {{{CAA2002}} ({{Heraklion}}, {{Crete}}; {{April}} 2002)}, - langid = {english} -} - -@article{filzmoser2009a, - title = {Univariate Statistical Analysis of Environmental (Compositional) Data: {{Problems}} and Possibilities}, - shorttitle = {Univariate Statistical Analysis of Environmental (Compositional) Data}, - author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, - date = {2009}, - journaltitle = {Science of The Total Environment}, - volume = {407}, - number = {23}, - pages = {6100--6108}, - issn = {00489697}, - doi = {10.1016/j.scitotenv.2009.08.008}, - abstract = {For almost 30~years it has been known that compositional (closed) data have special geometrical properties. In environmental sciences, where the concentration of chemical elements in different sample materials is investigated, almost all datasets are compositional. In general, compositional data are parts of a whole which only give relative information. Data that sum up to a constant, e.g. 100~wt.\%, 1,000,000~mg/kg are the best known example. It is widely neglected that the “closure” characteristic remains even if only one of all possible elements is measured, it is an inherent property of compositional data. No variable is free to vary independent of all the others. Existing transformations to “open” closed data are seldom applied. They are more complicated than a log transformation and the relationship to the original data unit is lost. Results obtained when using classical statistical techniques for data analysis appeared reasonable and the possible consequences of working with closed data were rarely questioned. Here the simple univariate case of data analysis is investigated. 
It can be demonstrated that data closure must be overcome prior to calculating even simple statistical measures like mean or standard deviation or plotting graphs of the data distribution, e.g. a histogram. Some measures like the standard deviation (or the variance) make no statistical sense with closed data and all statistical tests building on the standard deviation (or variance) will thus provide erroneous results if used with the original data.}, + journaltitle = {Mathematical Geology}, + volume = {35}, + number = {3}, + pages = {253--278}, + issn = {08828121}, + doi = {10.1023/A:1023866030544}, langid = {english} } -@article{egozcue2023, - title = {Subcompositional Coherence and and a Novel Proportionality Index of Parts}, - author = {Egozcue, Juan José and Pawlowsky-Glahn, Vera}, - date = {2023}, - journaltitle = {SORT}, - volume = {47}, - number = {2}, - pages = {229--244}, - doi = {10.57645/20.8080.02.7}, - abstract = {Research in compositional data analysis was motivated by spurious (Pearson) correlation. Spurious results are due to semantic incoherence, but the question of ways to relate parts in a statistically consistent way remains open. To solve this problem we frst defne a coherent system of functions with respect to a subcomposition and analyze the space of parts. This leads to understanding why measures like covariance and correlation depend on the subcomposition considered, while measures like the distance between parts are independent of the same. It allows the defnition of a novel index of proportionality between parts.}, +@article{pawlowsky-glahn2001, + title = {Geometric Approach to Statistical Analysis on the Simplex}, + author = {Pawlowsky-Glahn, V. and Egozcue, J. 
J.}, + date = {2001}, + journaltitle = {Stochastic Environmental Research and Risk Assessment}, + volume = {15}, + number = {5}, + pages = {384--398}, + issn = {14363240}, + doi = {10.1007/s004770100077}, + abstract = {The geometric interpretation of the expected value and the variance in real Euclidean space is used as a starting point to introduce metric counterparts on an arbitrary finite dimensional Hilbert space. This approach allows us to define general reasonable properties for estimators of parameters, like metric unbiasedness and minimum metric variance, resulting in a useful tool to better understand the logratio approach to the statistical analysis of compositional data, who's natural sample space is the simplex.}, langid = {english} } -@article{filzmoser2010, - title = {The Bivariate Statistical Analysis of Environmental (Compositional) Data}, - author = {Filzmoser, Peter and Hron, Karel and Reimann, Clemens}, - date = {2010-09-01}, - journaltitle = {Science of The Total Environment}, - volume = {408}, - number = {19}, - pages = {4230--4238}, - issn = {00489697}, - doi = {10.1016/j.scitotenv.2010.05.011}, - url = {http://linkinghub.elsevier.com/retrieve/pii/S0048969710004845}, - urldate = {2017-08-27}, - abstract = {Environmental sciences usually deal with compositional (closed) data. Whenever the concentration of chemical elements is measured, the data will be closed, i.e. the relevant information is contained in the ratios between the variables rather than in the data values reported for the variables. Data closure has severe consequences for statistical data analysis. Most classical statistical methods are based on the usual Euclidean geometry — compositional data, however, do not plot into Euclidean space because they have their own geometry which is not linear but curved in the Euclidean sense. 
This has severe consequences for bivariate statistical analysis: correlation coefficients computed in the traditional way are likely to be misleading, and the information contained in scatterplots must be used and interpreted differently from sets of non-compositional data. As a solution, the ilr transformation applied to a variable pair can be used to display the relationship and to compute a measure of stability. This paper discusses how this measure is related to the usual correlation coefficient and how it can be used and interpreted. Moreover, recommendations are provided for how the scatterplot can still be used, and which alternatives exist for displaying the relationship between two variables.}, +@article{rousseeuw1990, + title = {Unmasking {{Multivariate Outliers}} and {{Leverage Points}}}, + author = {Rousseeuw, Peter J. and family=Zomeren, given=Bert C., prefix=van, useprefix=true}, + date = {1990}, + journaltitle = {Journal of the American Statistical Association}, + shortjournal = {Journal of the American Statistical Association}, + volume = {85}, + number = {411}, + pages = {633--639}, + issn = {0162-1459, 1537-274X}, + doi = {10.1080/01621459.1990.10474920}, langid = {english} } -@inproceedings{aitchison1997, - title = {The {{One-Hour Course}} in {{Compositional Data Analysis}} or {{Compositional Data Analysis Is Simple}}}, - booktitle = {{{IAMG}}'97}, - author = {Aitchison, J.}, - editor = {Pawlowsky-Glahn, V.}, - date = {1997}, - pages = {3--35}, - publisher = {International Center for Numerical Methods in Engineering (CIMNE)}, - location = {Barcelona}, - eventtitle = {Third Annual Conference of the {{International Association}} for {{Mathematical Geology}}}, +@article{santos2020, + title = {Modern Methods for Old Data: {{An}} Overview of Some Robust Methods for Outliers Detection with Applications in Osteology}, + shorttitle = {Modern Methods for Old Data}, + author = {Santos, Frédéric}, + date = {2020}, + journaltitle = {Journal of Archaeological 
Science: Reports}, + shortjournal = {Journal of Archaeological Science: Reports}, + volume = {32}, + pages = {102423}, + issn = {2352409X}, + doi = {10.1016/j.jasrep.2020.102423}, langid = {english} } -@article{egozcue2024, - title = {Exploring Geochemical Data Using Compositional Techniques: {{A}} Practical Guide}, - shorttitle = {Exploring Geochemical Data Using Compositional Techniques}, - author = {Egozcue, Juan José and Gozzi, Caterina and Buccianti, Antonella and Pawlowsky-Glahn, Vera}, - date = {2024-03}, - journaltitle = {Journal of Geochemical Exploration}, - volume = {258}, - pages = {107385}, - issn = {03756742}, - doi = {10.1016/j.gexplo.2024.107385}, - langid = {english} +@book{vandenboogaart2013, + title = {Analyzing {{Compositional Data}} with {{R}}}, + author = {family=Boogaart, given=K. Gerald, prefix=van den, useprefix=true and Tolosana-Delgado, Raimon}, + date = {2013}, + series = {Use {{R}}!}, + publisher = {Springer-Verlag}, + location = {Berlin Heidelberg}, + doi = {10.1007/978-3-642-36809-7}, + isbn = {978-3-642-36808-0}, + langid = {english}, + pagetotal = {285} } -@article{grunsky2024, - title = {{{GeoCoDA}}: {{Recognizing}} and Validating Structural Processes in Geochemical Data. {{A}} Workflow on Compositional Data Analysis in Lithogeochemistry}, - shorttitle = {{{GeoCoDA}}}, - author = {Grunsky, Eric and Greenacre, Michael and Kjarsgaard, Bruce}, - date = {2024-06}, - journaltitle = {Applied Computing and Geosciences}, - volume = {22}, - pages = {100149}, - issn = {25901974}, - doi = {10.1016/j.acags.2023.100149}, +@incollection{weigand1977, + title = {Turquoise {{Sources}} and {{Source Analysis}}: {{Mesoamerica}} and the {{Southwestern U}}.{{S}}.{{A}}.}, + shorttitle = {Turquoise {{Sources}} and {{Source Analysis}}}, + booktitle = {Exchange {{Systems}} in {{Prehistory}}}, + author = {Weigand, P. C. and Harbottle, G. and Sayre, E.}, + editor = {Ericson, J. and Earle, T.
K.}, + date = {1977}, + pages = {15--34}, + publisher = {Academic Press}, + location = {New York, NY}, langid = {english} } diff --git a/vignettes/groups.Rmd b/vignettes/groups.Rmd index 2ba91f2..445fa9d 100644 --- a/vignettes/groups.Rmd +++ b/vignettes/groups.Rmd @@ -1,5 +1,5 @@ --- -title: "Working with Groups" +title: "Grouped Data" author: "N. Frerebeau" date: "`r Sys.Date()`" output: @@ -28,6 +28,8 @@ knitr::opts_chunk$set( library(nexus) ``` +# Reference Groups + Provenance studies typically rely on two approaches, which can be used together: * Identification of groups among the artifacts being studied, based on mineralogical or geochemical criteria (*clustering*). @@ -43,10 +45,19 @@ data("bronze", package = "folio") coda <- as_composition(bronze, parts = 4:11, groups = 3) ``` -`groups(x)` and `groups(x) <- value` allow to retrieve or set groups of an existing `CompositionMatrix`. Missing values (`NA`) or empty strings can be used to specify that a sample does not belong to any group. +`group()` allows you to set the groups of an existing `CompositionMatrix`. Missing values (`NA`) can be used to specify that a sample does not belong to any group. + +# Repeated Measurements + +If your data contain several observations for the same sample (e.g. repeated measurements), you can use one or more categorical variables to split the data into subsets and compute the compositional mean for each: + +```{r mean} +## Compositional mean by artefact +coda <- condense(coda, by = list(bronze$dynasty, bronze$reference)) +``` Once groups have been defined, they can be used by further methods (e.g. plotting). -Note that for better readability, you can `select` only some of the parts (e.g. major elements): +Note that for better readability, you can select only some of the parts (e.g.
major elements): ```{r barplot, fig.width=7, fig.height=7, out.width='100%'} ## Select major elements @@ -56,13 +67,7 @@ major <- coda[, is_element_major(coda)] barplot(major, order_rows = "Cu", space = 0) ``` -```{r mean, eval=FALSE} -## Compositional mean by artefact -coda <- condense(coda, by = list(bronze$dynasty, bronze$reference)) -``` - -# Multivariate Analysis -## Log-Ratio Analysis +# Log-Ratio Analysis ```{r pca, fig.width=7, fig.height=7, out.width='50%', fig.show='hold'} ## CLR @@ -72,12 +77,40 @@ clr <- transform_clr(coda, weights = TRUE) lra <- pca(clr) ## Visualize results -viz_individuals(lra, color = c("#004488", "#DDAA33", "#BB5566")) -viz_hull(x = lra, border = c("#004488", "#DDAA33", "#BB5566")) +viz_individuals( + x = lra, + extra_quali = group_names(clr), + color = c("#004488", "#DDAA33", "#BB5566"), + hull = TRUE +) viz_variables(lra) ``` +# Discriminant Analysis + +The log-transformed data can be assigned to a new column, allowing us to keep working with the data in the context of the original `data.frame`: + +```{r manova} +## ILR +ilr <- transform_ilr(coda) + +## MANOVA +fit <- manova(ilr ~ group_names(ilr)) +summary(fit) +``` + +The MANOVA results suggest that there are statistically significant differences between groups. + +```{r lda, fig.width=7, fig.height=7, out.width='100%'} +## LDA +discr <- MASS::lda(ilr, grouping = group_names(ilr)) +plot(discr) + +## Back transform results +transform_inverse(discr$means, origin = ilr) +``` + # References Aitchison, J. (1986). *The Statistical Analysis of Compositional Data. Monographs on Statistics and Applied Probability*. Londres, UK ; New York, USA: Chapman and Hall. diff --git a/vignettes/nexus.Rmd b/vignettes/nexus.Rmd index d8d99e5..ac11bfe 100644 --- a/vignettes/nexus.Rmd +++ b/vignettes/nexus.Rmd @@ -25,8 +25,8 @@ Provenance studies rely on the identification of probable sources, such that the **nexus** is designed for chemical fingerprinting and source tracking of ancient materials. 
It provides provides tools for exploration and analysis of compositional data in the framework of Aitchison (1986). If you are unfamiliar with the concepts and challenges of compositional data analysis, the following publications are a good place to start: -- Egozcue, J. J., Gozzi, C., Buccianti, A. & Pawlowsky-Glahn, V. (2024). Exploring Geochemical Data Using Compositional Techniques: A Practical Guide. *Journal of Geochemical Exploration*, 258 :107385. DOI: [10.1016/j.gexplo.2024.107385](https://doi.org/10.1016/j.gexplo.2024.107385). -- Greenacre, M. & Wood, J. R. (2024). A Comprehensive Workflow for Compositional Data Analysis in Archaeometry, with Code in R. *Archaeological and Anthropological Science*. +- Egozcue, J. J., Gozzi, C., Buccianti, A. & Pawlowsky-Glahn, V. (2024). Exploring Geochemical Data Using Compositional Techniques: A Practical Guide. *Journal of Geochemical Exploration*, 258: 107385. DOI: [10.1016/j.gexplo.2024.107385](https://doi.org/10.1016/j.gexplo.2024.107385). +- Greenacre, M. & Wood, J. R. (2024). A Comprehensive Workflow for Compositional Data Analysis in Archaeometry, with Code in R. *Archaeological and Anthropological Sciences*, 16: 171. DOI: [10.1007/s12520-024-02070-w](https://doi.org/10.1007/s12520-024-02070-w). - Grunsky, E., Greenacre, M. & Kjarsgaard, B. (2024). GeoCoDA: Recognizing and Validating Structural Processes in Geochemical Data. A Workflow on Compositional Data Analysis in Lithogeochemistry. *Applied Computing and Geosciences*, 22: 100149. DOI: [10.1016/j.acags.2023.100149](https://doi.org/10.1016/j.acags.2023.100149). # Get started