Skip to content

Commit

Permalink
Add chuck-size parameter to cylinter config. (#181)
Browse files Browse the repository at this point in the history
  • Loading branch information
gjbaker authored Mar 14, 2024
1 parent c2be19f commit 82d3261
Show file tree
Hide file tree
Showing 2 changed files with 355 additions and 23 deletions.
344 changes: 344 additions & 0 deletions cylinter/cylinter_config 1.29.23 PM.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,344 @@
# GENERAL PROGRAM CONFIGURATIONS

inDir: /Users/<username>/Desktop/cylinter_demo
# Path to CyLinter input directory containing multi-channel
# image files (TIF or OME-TIF), segmentation outlines (OME-TIF),
# segmentation masks (TIF), and corresponding single-cell feature tables (CSV)

outDir: /Users/<username>/Desktop/cylinter_demo/output
# CyLinter output directory. Path is created if it does not exist.

sampleMetadata:
"1": ["1", "Normal kidney cortex", "NKC", "CANCER-FALSE", 1]
"15": ["15", "Glioblastoma", "GBM", "CANCER-TRUE", 1]
"18": ["18", "Mesothelioma", "MTO", "CANCER-TRUE", 1]
"68": ["68", "Tonsil", "TSL", "CANCER-FALSE", 3]
# Sample metadata dictionary: keys = file names; values = list of strings.
# First elements: sample names (str)
# Second elements: descriptive text of experimental condition (str)
# Third elements: abbreviation of experimental condition (str)
# Fourth elements: comma-delimited string of arbitrary binary declarations
# for computing t-statistics between two groups of samples (str dytpe)
# Fifth elements: replicate number specifying biological or
# technical replicates (int)

samplesToExclude: []
# (list of strs) Sample names to exclude from analysis specified
# according to the first elements of sampleMetadata configuration.

counterstainChannel: "DNA1"
# (str) Name of marker in markers.csv file for use in visualizing nuclear counterstain

markersToExclude: ["Rabbit IgG", "Goat IgG", "Mouse IgG", "CD56", "CD13",
"pAUR", "CCNE", "CDKN2A", "PCNA_1", "CDKN1B_2",
"CD63", "CD32", "CCNA2", "CDKN1C", "PCNA_1",
"CDKN1B_1", "CCND1", "cPARP", "pCREB",
"CCNB1", "PCNA_2", "CDK2"
]
# (list of strs) Immunomarkers to exclude from analysis
# Does not include nuclear dyes. They are needed for the
# cycleCorrelation module to remove cell dropout.

###############################################################################
# MODULE-SPECIFIC CONFIGURATIONS

# selectROIs-------------------------------------------------------------------
delintMode: True
# (bool) Whether to drop (True; negative selection) or
# retain (False; positive selection) cells selected by ROIs.

showAbChannels: True
# (bool) Whether to show all immunomarker channels (True) when Napari
# is open (may be memory limiting) or show cycle 1 DNA only (False).

samplesForROISelection: ["1", "15", "18", "68"]
# (list of strs) Sample names for ROI selection specified
# according to the first elements of sampleMetadata configuration.

autoArtifactDetection: True
# (bool) Whether to display tools for automated artifact detection in Napari window

artifactDetectionMethod: "classical"
# (str) Algorithm used for automated artifact detection (current option: "classical").
# Multi-layer perceptron method ("MLP") currently under development.


# intensityFilter-------------------------------------------------------------------
numBinsIntensity: 50
# (int) Number of bins for DNA intensity histograms.


# areaFilter-------------------------------------------------------------------
numBinsArea: 50
# (int) Number of bins for DNA area histograms.


# cycleCorrelation-------------------------------------------------------------------
numBinsCorrelation: 50
# (int) Number of bins for DNA1/DNAn histograms.


# pruneOutliers-------------------------------------------------------------------
hexbins: False
# (bool) Whether to use hexbins (True) or scatter plots (False) to plot
# single-cell signal intensities. Scatter plots allow for higher resolution,
# but may require longer rendering times.

hexbinGridSize: 20
# (int) Hexbin grid size when hexins=True.
# Higher values increase bin resolution.


# metaQC (optional)-------------------------------------------------------------------
metaQC: False
# (bool) Whether to perform data reclassification based on
# unsupervised clustering results of combinations of clean and
# noisy (previously-redacted) data.

embeddingAlgorithmQC: "UMAP"
# (str) Embedding algorithm used for clustering (options: "TSNE" or "UMAP").

channelExclusionsClusteringQC: []
# (list of strs) Immunomarkers to exclude from clustering.

samplesToRemoveClusteringQC: []
# (list of strs) Samples to exclude from clustering.

fracForEmbeddingQC: 1.0
# (float) Fraction of cells to be embedded (range: 0.0-1.0)
# Limits the amount of data passed to downstream modules.

dimensionEmbeddingQC: 2
# (int) Dimension of the embedding (fixed to 2 in current version).

topMarkersQC: "clusters"
# (str) Normalization axis ("channels" or "clusters") used to define
# highest expressed markers per cluster.

colormapAnnotationQC: "Sample"
# (str) Metadata annotation to colormap the embedding: Sample or Condition.

metricQC: "euclidean"
# (str) Distance metric for computing embedding.
# Choose from valid metrics used by scipy.spatial.distance.pdist:
# "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine",
# "dice", "euclidean", "hamming", "jaccard", "jensenshannon", "kulsinski",
# "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao",
# "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule".

# --------------------------------------
# tSNE-specific configurations:
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
perplexityQC: 50.0
# (float) Related to the number of nearest neighbors used in other
# manifold learning algorithms. Larger datasets usually require
# larger perplexity. Different values can result in significantly
# different results.

earlyExaggerationQC: 12.0
# (float) For larger values, the space between natural clusters
# will be larger in the embedded space.

learningRateTSNEQC: 200.0
# (float) tSNE learning rate (typically between 10.0 and 1000.0).

randomStateQC: 5
# (int) Determines the random number generator for reproducible results
# across multiple function calls.

# --------------------------------------
# UMAP-specific configurations:
# https://umap-learn.readthedocs.io/en/latest/api.html
nNeighborsQC: 6
# (int) The size of local neighborhood (in terms of number of
# neighboring sample points) used for manifold approximation.
# Larger values result in more global views of the manifold,
# while smaller values result in more local data being preserved.
# In general values should be in the range 2 to 100.

learningRateUMAPQC: 1.0
# (float) The initial learning rate for the embedding optimization.

minDistQC: 0.1
# (float) The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped
# embedding where nearby points on the manifold are drawn
# closer together, while larger values will result on a more
# even dispersal of points. The value should be set relative
# to the spread value, which determines the scale at which
# embedded points will be spread out.

repulsionStrengthQC: 5.0
# (float) Weighting applied to negative samples in low dimensional
# embedding optimization. Values higher than one will
# result in greater weight being given to negative samples.


# PCA-------------------------------------------------------------------
channelExclusionsPCA: []
# (strs) Immunomarkers to exclude from PCA analysis.

samplesToRemovePCA: []
# (list of strs) Samples to exclude from PCA analysis.

dimensionPCA: 2
# (int) Number of PCs to compute.

pointSize: 90.0
# (float) scatter point size for sample scores plot.

labelPoints: True
# (bool) Annotate scatter points with condition abbreviations
# from sampleMetadata configuration.

distanceCutoff: 0.15
# (float) Maximum distance between data points in PCA scores plot to
# be annotated with a common label. Useful for increasing visual clarity
# of PCA plots containing many data points. Applicable when
# labelPoints is True.

conditionsToSilhouette: []
# (list of strs) List of abbreviated condition names whose corresponding
#scores plot points will be greyed out, left unannotated, and sent to the back
# of the plot (zorder). Useful for increasing visual clarity of PCA
# plots containing many data points.


# gating (optional)-------------------------------------------------------------------
gating: True
# (bool) Whether to perform SYLARAS-style gating on single-cell data.
# Cell Syst. 2020 Sep 23;11(3):272-285.e9 PMID: 32898474

channelExclusionsGating: []
# (list of strs) Immunomarkers to exclude from gating.

samplesToRemoveGating: []
# (list of strs) Samples to exclude from gating.

vectorThreshold: 100
# (int) vizualize Boolean vectors with cell counts >= vectorThreshold

classes:
Tumor:
definition: [+pan-CK, +KI67]
subsets: []
# (dict) Boolean immunophenotype signatures.
# +marker = immunopositive , -marker = immunonegative, marker = don't care


# clustering-------------------------------------------------------------------
embeddingAlgorithm: "UMAP"
# (str) Embedding algorithm to use for clustering (options: "TSNE" or "UMAP").

channelExclusionsClustering: []
# (list of strs) Immunomarkers to exclude from clustering.

samplesToRemoveClustering: []
# (list of strs) Samples to exclude from clustering.

normalizeTissueCounts: True
# (bool) Make the number of cells per tissue for clustering more similar
# through sample-weighted random sampling.

fracForEmbedding: 1.0
# (float) Fraction of cells to be embedded (range: 0.0-1.0).
# Limits amount of data passed to downstream modules.

dimensionEmbedding: 2
# (int) Dimension of the embedding (options: 2 or 3).

topMarkers: "clusters"
# (str) Normalization axis ("channels" or "clusters") used to define
# highest expressed markers per cluster.

colormapAnnotationClustering: "Sample"
# (str) Metadata annotation to colormap the embedding: Sample or Condition.

metric: "euclidean"
# (str) Distance metric for computing embedding.
# Choose from valid metrics used by scipy.spatial.distance.pdist:
# "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine",
# "dice", "euclidean", "hamming", "jaccard", "jensenshannon", "kulsinski",
# "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao",
# "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule".

# --------------------------------------
# tSNE-specific configurations:
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
perplexity: 50.0
# (float) Related to the number of nearest neighbors used in other
# manifold learning algorithms. Larger datasets usually require
# larger perplexity. Different values can result in significantly
# different results.

earlyExaggeration: 12.0
# (flaot) For larger values, the space between natural clusters
# will be larger in the embedded space.

learningRateTSNE: 200.0
# (float) tSNE learning rate (typically between 10.0 and 1000.0).

randomStateTSNE: 5
# (int) Determines the random number generator for reproducible results
# across multiple function calls.

# --------------------------------------
# UMAP-specific configurations:
# https://umap-learn.readthedocs.io/en/latest/api.html
nNeighbors: 6
# (int) The size of local neighborhood (in terms of number of
# neighboring sample points) used for manifold approximation.
# Larger values result in more global views of the manifold,
# while smaller values result in more local data being preserved.
# In general values should be in the range 2 to 100.

learningRateUMAP: 1.0
# (float) The initial learning rate for the embedding optimization.

minDist: 0.1
# (float) The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped
# embedding where nearby points on the manifold are drawn
# closer together, while larger values will result on a more
# even dispersal of points. The value should be set relative
# to the spread value, which determines the scale at which
# embedded points will be spread out.

repulsionStrength: 5.0
# (float) Weighting applied to negative samples in low dimensional
# embedding optimization. Values higher than one will
# result in greater weight being given to negative samples.

randomStateUMAP: 5
# (int) Determines the random number generator for reproducible results
# across multiple function calls.


# frequencyStats-------------------------------------------------------------------
controlGroups: ["CANCER-FALSE"]
# (list of strs) Corresponds to control groups for each binary declaration
# specified as the third elements of sampleMetadata values.

denominatorCluster: null
# (None type) Cluster to be used as the denominator when computing cluster
# frequency ratios. Set to null first, change to cluster integer number
# to normalize cluster frequencies to a particular cluster if desired.

FDRCorrection: False
# (bool) Whether to compute p-vals and false discovery rate (FDR)-corrected
# q-vals (True) or compute uncorrected p-vals only (False).


# curateThumbnails-------------------------------------------------------------
numThumbnails: 25
# (int) Number of examples per cluster to be curated.

topMarkersThumbnails: "clusters"
# (str) Normalization axis ("channels" or "clusters") used to define
# highest expressed markers per cluster.

windowSize: 30
# (int) Number of pixels in x and y dimensions per thumbnail.

segOutlines: True
# (bool) Whether to overlay cell segmentation outlines on thumbnail images.
Loading

0 comments on commit 82d3261

Please sign in to comment.