-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cell type annotation: Harmony/KNN workflow #836
base: main
Are you sure you want to change the base?
Changes from all commits
776aa42
6f49154
0aead50
3173799
42628e9
1ed4338
c1bb7dd
0ca63dd
02b6d67
3f97c73
1367a25
725cb22
aab55fc
ed65300
6c42512
64b5df4
27a6509
acb909b
5b55c59
ab5c8a2
f468f3b
855cb7f
966e9b9
e2049f1
68f9446
adfb3f3
08f6b60
ee5e9bf
1648952
23d3c26
40f7ab2
89c4242
a6e49a3
71fa4bd
3ccbb81
5850c1a
c20a529
8f1a64e
7304634
c67a4ff
3f36a07
cf8bc4d
f4efba8
8a893fb
3b399b2
99b66a4
8993be5
9b9f7de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
name: "harmony_knn"
namespace: "workflows/annotation"
description: "Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer."
authors:
  - __merge__: /src/authors/dorien_roosen.yaml
    roles: [ author, maintainer ]
  - __merge__: /src/authors/weiwei_schultz.yaml
    roles: [ contributor ]

argument_groups:
  - name: Query Input
    arguments:
      - name: "--id"
        required: true
        type: string
        description: ID of the sample.
        example: foo
      - name: "--input"
        required: true
        type: file
        description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.
        example: input.h5mu
      - name: "--modality"
        description: Which modality to process. Should match the modality of the --reference dataset.
        type: string
        default: "rna"
        required: false
      - name: "--input_obsm_embedding"
        example: "X_pca"
        type: string
        description: Embedding .obsm column to use as input for integration. Should match the embedding .obsm column of the --reference dataset.
      - name: "--input_obs_batch_label"
        type: string
        description: "The .obs field in the input (query) dataset containing the batch labels."
        example: "sample"
        required: true
      - name: "--overwrite_existing_key"
        type: boolean_true
        description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.

  - name: Reference input
    arguments:
      - name: "--reference"
        required: true
        type: file
        description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.
        example: reference.h5mu
      - name: "--reference_obs_targets"
        type: string
        example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ]
        required: true
        multiple: true
        description: The `.obs` key(s) of the target labels to transfer.
      - name: "--reference_obs_batch_label"
        type: string
        description: "The .obs field in the reference dataset containing the batch labels."
        example: "sample"
        required: true

  - name: Harmony integration options
    arguments:
      - name: "--theta"
        type: double
        description: |
          Diversity clustering penalty parameter. Specify for each variable in group.by.vars.
          A value of theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters.
        min: 0
        default: [2]
        multiple: true

  - name: Leiden clustering options
    arguments:
      - name: "--leiden_resolution"
        type: double
        description: Control the coarseness of the clustering. Higher values lead to more clusters.
        min: 0
        default: [1]
        multiple: true

  - name: Neighbor classifier arguments
    arguments:
      - name: "--weights"
        type: string
        default: "uniform"
        choices: ["uniform", "distance"]
        description: |
          Weight function used in prediction. Possible values are:
          `uniform` (all points in each neighborhood are weighted equally) or
          `distance` (weight points by the inverse of their distance)
      - name: "--n_neighbors"
        type: integer
        default: 15
        # A neighbor count below 1 is meaningless for a KNN classifier.
        min: 1
        required: false
        description: |
          The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
          Larger values will result in more accurate search results at the cost of computation time.

  - name: "Outputs"
    arguments:
      - name: "--output"
        type: file
        required: true
        direction: output
        description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.
        example: output.h5mu
      - name: "--output_obs_predictions"
        type: string
        required: false
        multiple: true
        description: |
          In which `.obs` slots to store the predicted cell labels.
          If provided, must have the same length as `--reference_obs_targets`.
          If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix.
      - name: "--output_obs_probability"
        type: string
        required: false
        multiple: true
        description: |
          In which `.obs` slots to store the probability of the predictions.
          If provided, must have the same length as `--reference_obs_targets`.
          If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix.
      - name: "--output_obsm_integrated"
        type: string
        default: "X_integrated_harmony"
        required: false
        description: "In which .obsm slot to store the integrated embedding."
      - name: "--output_compression"
        type: string
        description: |
          The compression format to be used on the output h5mu object.
        choices: ["gzip", "lzf"]
        required: false
        example: "gzip"

dependencies:
  - name: workflows/integration/harmony_leiden
    alias: harmony_leiden_workflow
  - name: labels_transfer/knn
  - name: dataflow/split_h5mu
  - name: dataflow/concatenate_h5mu
  - name: metadata/add_id
  - name: metadata/duplicate_obs

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf

test_resources:
  - type: nextflow_script
    path: test.nf
    entrypoint: test_wf
  - path: /resources_test/scgpt

runners:
  - type: nextflow
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash

# Abort on any failure so nextflow is never launched from a wrong directory
# after a failed `git rev-parse` or `cd`.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

# Run the workflow's integration test (test_wf entrypoint in test.nf).
nextflow \
  run . \
  -main-script src/workflows/annotation/harmony_knn/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,162 @@ | ||||||||||||||||||
workflow run_wf {
  take:
    input_ch

  main:

    output_ch = input_ch
      // Set aside the output for this workflow to avoid conflicts
      | map {id, state ->
        def new_state = state + ["workflow_output": state.output]
        [id, new_state]
      }
      // add id as _meta join id to be able to merge with source channel and end of workflow
      | map{ id, state ->
        def new_state = state + ["_meta": ["join_id": id]]
        [id, new_state]
      }
      | view {"After adding join_id: $it"}
      // Add 'query' id to .obs columns of query dataset
      | add_id.run(
        fromState: [
          "input": "input",
        ],
        args:[
          "input_id": "query",
          "obs_output": "dataset",
        ],
        toState: ["input": "output"]
      )
      // Add 'reference' id to .obs columns of reference dataset
      | add_id.run(
        fromState:[
          "input": "reference",
        ],
        args:[
          "input_id": "reference",
          "obs_output": "dataset"
        ],
        toState: ["reference": "output"]
      )
      // Make sure that query and reference dataset have batch information in the same .obs column
      // by copying the respective .obs columns to the obs column "batch_label"
      | duplicate_obs.run(
        fromState: [
          "input": "input",
          "modality": "modality",
          "input_obs_key": "input_obs_batch_label",
          "overwrite_existing_key": "overwrite_existing_key"
        ],
        args: [
          "output_obs_key": "batch_label"
        ],
        toState: [
          "input": "output"
        ]
      )
      | duplicate_obs.run(
        fromState: [
          "input": "reference",
          "modality": "modality",
          "input_obs_key": "reference_obs_batch_label",
          "overwrite_existing_key": "overwrite_existing_key"
        ],
        args: [
          "output_obs_key": "batch_label"
        ],
        toState: [
          "reference": "output"
        ]
      )
      // Concatenate query and reference datasets prior to integration
      | concatenate_h5mu.run(
        fromState: { id, state -> [
            "input": [state.input, state.reference]
          ]
        },
        args: [
          "input_id": ["query", "reference"],
          "other_axis_mode": "move"
        ],
        toState: ["input": "output"]
      )
      | view {"After concatenation: $it"}
      // Run harmony integration with leiden clustering
      | harmony_leiden_workflow.run(
        fromState: { id, state ->
          [
            "id": id,
            "input": state.input,
            "modality": state.modality,
            // NOTE: the config declares this argument as --input_obsm_embedding,
            // so the state key is "input_obsm_embedding" (state.obsm_embedding
            // would always be null).
            "embedding": state.input_obsm_embedding,
            "obsm_integrated": state.output_obsm_integrated,
            "theta": state.theta,
            "leiden_resolution": state.leiden_resolution,
          ]
        },
        args: [
          "uns_neighbors": "harmonypy_integration_neighbors",
          "obsp_neighbor_distances": "harmonypy_integration_distances",
          "obsp_neighbor_connectivities": "harmonypy_integration_connectivities",
          "obs_cluster": "harmony_integration_leiden",
          "obsm_umap": "X_leiden_harmony_umap",
          "obs_covariates": "batch_label"
        ],
        toState: ["input": "output"]
      )
      | view {"After integration: $it"}
      // Split integrated dataset back into a separate reference and query dataset
      | split_h5mu.run(
        fromState: [
          "input": "input",
          "modality": "modality"
        ],
        args: [
          "obs_feature": "dataset",
          "output_files": "sample_files.csv",
          "drop_obs_nan": "true",
          "output": "ref_query"
        ],
        toState: [
          "output": "output",
          "output_files": "output_files"
        ],
        auto: [ publish: true ]
      )
      | view {"After sample splitting: $it"}
      // map the integrated query and reference datasets back to the state
      | map {id, state ->
        def outputDir = state.output
        def files = readCsv(state.output_files.toUriString())
        def query_file = files.findAll{ dat -> dat.name == 'query' }
        assert query_file.size() == 1, 'there should only be one query file'
        def reference_file = files.findAll{ dat -> dat.name == 'reference' }
        assert reference_file.size() == 1, 'there should only be one reference file'
        def integrated_query = outputDir.resolve(query_file.filename)
        def integrated_reference = outputDir.resolve(reference_file.filename)
        def newKeys = ["integrated_query": integrated_query, "integrated_reference": integrated_reference]
        [id, state + newKeys]
      }
      | view {"After splitting query: $it"}
      // Perform KNN label transfer from integrated reference to integrated query
      | knn.run(
        fromState: [
          "input": "integrated_query",
          "modality": "modality",
          "input_obsm_features": "output_obsm_integrated",
          "reference": "integrated_reference",
          "reference_obsm_features": "output_obsm_integrated",
          "reference_obs_targets": "reference_obs_targets",
          "output_obs_predictions": "output_obs_predictions",
          "output_obs_probability": "output_obs_probability",
          "output_compression": "output_compression",
          "weights": "weights",
          "n_neighbors": "n_neighbors",
          "output": "workflow_output"
        ],
        toState: {id, output, state -> ["output": output.output]},
      )

  emit:
    output_ch
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
// Minimum Nextflow version required to run this workflow.
manifest {
  nextflowVersion = '!>=20.12.1-edge'
}

params {
  // Resolve the repository root relative to this config file's location.
  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}

// include common settings
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add the
test_dependencies
toinfo
?