-
Notifications
You must be signed in to change notification settings - Fork 0
/
immuneml_create_dataset.xml
178 lines (161 loc) · 13 KB
/
immuneml_create_dataset.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
<tool id="immune_ml_dataset" name="Create dataset" version="@[email protected]">
<!-- Short description shown next to the tool name; intentionally left empty here. -->
<description></description>
<!-- Shared macro definitions are pulled in from prod_macros.xml (not part of this file). -->
<macros>
<import>prod_macros.xml</import>
</macros>
<!-- Expands the "requirements" macro; presumably defined in prod_macros.xml (confirm against that file). -->
<expand macro="requirements" />
<!--
    Command template (Cheetah, rendered to a shell command line).
    1. Links every selected data file into the working directory under its history name.
    2. Simple interface: runs build_dataset_yaml_wrapper.py to generate specs.yaml and renames it
       to create_dataset.yaml. Advanced interface: copies the user-supplied YAML instead.
    3. Runs immune-ml with DatasetGenerationTool, then moves index.html and the YAML
       specification to the two declared outputs.
    All Galaxy-provided paths and user-controlled names are single-quoted so that the shell
    does not expand or word-split them.
-->
<command><![CDATA[
    ## Expose each selected input under its element identifier. A name that is
    ## already present in the working directory is reported and skipped.
    ## NOTE(review): an identifier that itself contains a single quote would still
    ## break the quoting below; sanitise upstream if such names must be supported.
    #for $input in $interface_cond.data_input
        #if $input
            ([ -e ./'$input.element_identifier' ] && printf "File '%s' already exists in the input folder, skipping.\n" '$input.element_identifier' || ln -s '$input' ./'$input.element_identifier') &&
        #end if
    #end for

    #if $interface_cond.interface == "simple"
        ## Simple interface: build the YAML specification from the form values.
        python3 '$__tool_directory__/build_dataset_yaml_wrapper.py'
            --output_path . --file_name specs.yaml
        #if $interface_cond.dataset_cond.dataset_type == "repertoire"
            --is_repertoire True
            --format '$interface_cond.dataset_cond.metadata_cond.data_format'
            ## The IReceptor format has no metadata file parameter, so nothing is passed or copied for it.
            #if $interface_cond.dataset_cond.metadata_cond.data_format != "IReceptor"
                --metadata_file '$interface_cond.dataset_cond.metadata_cond.metadata_input' &&
                cp '$interface_cond.dataset_cond.metadata_cond.metadata_input' ./'$interface_cond.dataset_cond.metadata_cond.metadata_input.element_identifier'
            #end if
        #else
            --is_repertoire False
            --format '$interface_cond.dataset_cond.data_format'
            --metadata_columns '$interface_cond.dataset_cond.metadata_columns'
            #if $interface_cond.dataset_cond.dataset_type == "sequence"
                --paired False
            #elif $interface_cond.dataset_cond.dataset_type == "receptor"
                --paired True
                --receptor_chains '$interface_cond.dataset_cond.receptor_type'
            #end if
        #end if
        && mv ./specs.yaml create_dataset.yaml &&
    #else
        ## Advanced interface: use the YAML specification supplied by the user.
        ## The parameter lives inside the interface_cond conditional, hence the full path.
        cp '$interface_cond.yaml_input' create_dataset.yaml &&
    #end if

    immune-ml ./create_dataset.yaml '${html_outfile.files_path}' --tool DatasetGenerationTool &&
    mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
    mv ./create_dataset.yaml '${specs}'
]]>
</command>
<!--
    Tool form. The top-level conditional switches between a button-based "simple" interface
    (dataset type, data format, metadata, data files) and an "advanced" interface where the
    user supplies a complete YAML specification plus the files it refers to.
-->
<inputs>
    <conditional name="interface_cond">
        <param type="select" name="interface" label="Which interface would you like to use?" display="radio">
            <option value="simple">Simplified (limited options)</option>
            <option value="advanced">Advanced (full control through YAML) </option>
        </param>
        <when value="simple">
            <conditional name="dataset_cond">
                <param type="select"
                       name="dataset_type"
                       label="Dataset type"
                       display="radio"
                       help="Repertoire datasets should be used when making predictions per repertoire, such as predicting a disease state. Sequence or receptor datasets should be used when predicting values for unpaired (single-chain) and paired immune receptors respectively, like antigen specificity.">
                    <option value="repertoire">Repertoire dataset</option>
                    <option value="sequence">Sequence dataset (single chain)</option>
                    <option value="receptor">Receptor dataset (paired chains)</option>
                </param>
                <when value="repertoire">
                    <!-- Repertoire datasets: every format except IReceptor asks for a metadata file. -->
                    <conditional name="metadata_cond">
                        <param type="select" name="data_format" label="Data format" display="radio">
                            <option value="AIRR">AIRR</option>
                            <option value="IReceptor">iReceptor Gateway</option>
                            <option value="ImmunoSEQRearrangement">immunoSEQ: rearrangement-level files</option>
                            <option value="ImmunoSEQSample">immunoSEQ: sample-level files</option>
                            <option value="MiXCR">MiXCR</option>
                            <option value="VDJdb">VDJdb</option>
                            <option value="TenxGenomics">10x Genomics ‘Clonotype consensus annotations’ (CSV)</option>
                        </param>
                        <when value="AIRR">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire AIRR file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                        <!-- No extra inputs for IReceptor: the command passes no metadata file for this format. -->
                        <when value="IReceptor"/>
                        <when value="ImmunoSEQRearrangement">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ rearrangement file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                        <when value="ImmunoSEQSample">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ sample file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                        <when value="MiXCR">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire MiXCR file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                        <when value="VDJdb">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire VDJdb file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                        <when value="TenxGenomics">
                            <param name="metadata_input"
                                   type="data"
                                   format="txt"
                                   label="Metadata file"
                                   multiple="false"
                                   help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire 10x Genomics file described in the metadata file must be selected under 'Data files'."/>
                        </when>
                    </conditional>
                </when>
                <when value="sequence">
                    <!-- Sequence (single chain) datasets: metadata comes from columns inside the data files. -->
                    <param type="select" name="data_format" label="Data format" display="radio">
                        <option value="AIRR">AIRR</option>
                        <option value="IReceptor">iReceptor Gateway</option>
                        <option value="ImmunoSEQRearrangement">ImmunoSEQ: rearrangement-level files</option>
                        <option value="ImmunoSEQSample">ImmunoSEQ: sample-level files</option>
                        <option value="MiXCR">MiXCR</option>
                        <option value="VDJdb">VDJdb</option>
                        <option value="TenxGenomics">10xGenomics ‘Clonotype consensus annotations’ (CSV)</option>
                    </param>
                    <param type="text"
                           name="metadata_columns"
                           optional="false"
                           label="Metadata columns"
                           help="Please specify the names of the columns that contain metadata. The metadata columns specified here can be used as labels for prediction. Multiple metadata columns may be specified and separated by comma, for example: Epitope,Epitope gene,Epitope species"/>
                </when>
                <when value="receptor">
                    <!-- Receptor (paired chain) datasets: additionally ask which chain pair is present. -->
                    <param type="select" name="data_format" label="Data format" display="radio">
                        <option value="AIRR">AIRR</option>
                        <option value="IReceptor">iReceptor Gateway</option>
                        <option value="VDJdb">VDJdb</option>
                        <option value="TenxGenomics">10xGenomics ‘Clonotype consensus annotations’ (CSV)</option>
                    </param>
                    <param type="select" name="receptor_type" label="Receptor type" display="radio">
                        <option value="TRA_TRB">T cell alpha/beta</option>
                        <option value="TRG_TRD">T cell gamma/delta</option>
                        <option value="IGH_IGL">B cell heavy/light</option>
                        <option value="IGH_IGK">B cell heavy/kappa</option>
                    </param>
                    <param type="text"
                           name="metadata_columns"
                           optional="false"
                           label="Metadata columns"
                           help="Please specify the names of the columns that contain metadata. The metadata columns specified here can be used as labels for prediction. Multiple metadata columns may be specified and separated by comma, for example: Epitope,Epitope gene,Epitope species"/>
                </when>
            </conditional>
            <!-- Data files for the simple interface; at least one file is required. -->
            <param name="data_input"
                   type="data"
                   multiple="true"
                   label="Data files"
                   min="1"
                   max="2000"
                   help="This field should include individual repertoire or receptor files, or iReceptor zip files. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted in blue or gray."/>
        </when>
        <when value="advanced">
            <!-- Advanced interface: a user-written YAML specification plus any files it refers to. -->
            <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
            <param name="data_input"
                   type="data"
                   multiple="true"
                   label="Data and metadata files"
                   optional="true"
                   help="This field should include individual repertoire or receptor files, or iReceptor zip files, and optionally a metadata file. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted."/>
        </when>
    </conditional>
</inputs>
<outputs>
    <!-- The YAML specification given to immuneML; the command moves create_dataset.yaml here. -->
    <data name="specs" format="txt" label="create_dataset.yaml"/>
    <!-- The dataset history item; the command moves the generated index.html here and writes
         the remaining files into this output's files_path directory. -->
    <data name="html_outfile" format="immuneml_receptors" label="ImmuneML dataset"/>
</outputs>
<help><![CDATA[
In Galaxy, an immuneML dataset is simply a Galaxy collection containing all relevant files (including an optional metadata file).
The Create dataset Galaxy tool allows users to import data from various formats and create immuneML datasets in Galaxy.
These datasets are in an optimized binary (Pickle) format, which ensures that you can quickly import the dataset into
Galaxy tools without having to repeatedly specify the import parameters.
Before creating a dataset, the relevant data files must first be uploaded to the Galaxy interface. This can be done either
by uploading files from your local computer (use the 'Upload file' tool under the 'Get local data' menu), or by fetching
remote data from the iReceptor Plus Gateway or VDJdb (see `How to import remote AIRR datasets in Galaxy <https://docs.immuneml.uio.no/latest/galaxy/galaxy_import_remote_data.html>`_).
The imported immuneML dataset is stored in a Galaxy collection, which will appear as a history item on the right side of the screen,
and can later be selected as input to other tools.
The tool has a simplified and an advanced interface. The simplified interface is fully button-based, and relies
on default settings for importing datasets. The advanced interface gives full control over import settings through a YAML
specification. In most cases, the simplified interface will suffice.
For the exhaustive documentation of this tool and more information about immuneML datasets, see the tutorial `How to make an immuneML dataset in Galaxy <https://docs.immuneml.uio.no/latest/galaxy/galaxy_dataset.html>`_.
**Tool output**
This Galaxy tool will produce the following history elements:
- ImmuneML dataset: a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset (which is used in the dataset definition of a YAML specification), the dataset type and size, available labels, and a link to download the raw data files.
- create_dataset.yaml: the YAML specification file that was used by immuneML to create the dataset. This file can be downloaded and altered (for example to export files in AIRR format, or to use non-standard import parameters), and run again using the 'Advanced' interface.
]]>
</help>
</tool>