Skip to content

Commit

Permalink
checked files:before append appendAdapters
Browse files Browse the repository at this point in the history
  • Loading branch information
Rong Li Lab authored and Rong Li Lab committed May 31, 2017
1 parent ae28014 commit ae60d7e
Show file tree
Hide file tree
Showing 16 changed files with 117 additions and 195 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
*.nsq
*.log

# MatLab temp files
*.asv
22 changes: 9 additions & 13 deletions Db/Mouse.parameters.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
<!-- general parameters -->
<general>
<verbose>1</verbose>
<key1>cdna</key1>
<key2>ncrna</key2>
<key3>gene:\S*</key3>
<key4>gene_biotype:\S*</key4>
</general>

<!-- rnaSeq parameters -->
Expand All @@ -20,8 +24,6 @@
<!-- cdnaParse.m -->
<cdna>
<dir1>C:\FISHerMan\Db\Mouse38.cdna.fa</dir1>
<key1>ENS\w*T\d*</key1>
<key2>ENS\w*G\d*</key2>
</cdna>

<!-- ncrna parameters -->
Expand All @@ -30,20 +32,16 @@
<dir1>C:\FISHerMan\Db\Mouse38.ncrna.fa</dir1>
<tRNA>1</tRNA>
<dirT>C:\FISHerMan\Db\Mouse.trna.fas</dirT>
<key1>ENS\w*T\d*</key1>
<key2>ENS\w*G\d*</key2>
<key3>gene_biotype:\S*</key3>
</ncrna>

<!-- abundantrna parameters -->
<!-- abundantrnaParse.m -->
<abundantrna>
<percent>0.001</percent>
<key1>ENS\w*T\d*</key1>
<key2>:rRNA</key2>
<key3>:Mt_rRNA</key3>
<key4>:tRNA</key4>
<key5>:Mt_tRNA</key5>
<key1>:rRNA</key1>
<key2>:Mt_rRNA</key2>
<key3>:tRNA</key3>
<key4>:Mt_tRNA</key4>
</abundantrna>

<!-- transcriptList parameters -->
Expand Down Expand Up @@ -78,16 +76,14 @@
<!-- oligos parameters -->
<!-- oligosParse.m -->
<oligos>
<key1>ENS\w*T\d*</key1>
<key2>ENS\w*G\d*</key2>
<number>48</number>
<seqNum>1000</seqNum>
<thres>30</thres>
<querySize>30</querySize>
<DbSize>200000</DbSize>
<blastArgs>-S 2</blastArgs>
<parallel>0</parallel>
<dir1>C:\FISHerMan\Db\Mouse.STList.fas</dir1>
<dirST>C:\FISHerMan\Db\Mouse.STList.fas</dirST>
</oligos>

<!-- adapters parameters -->
Expand Down
2 changes: 1 addition & 1 deletion Db/Mouse.transcriptList.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@

>ENSMUST00000015612

>ENSMUST00000183557
>ENSMUST00000180842

>ENSMUST00000084289
31 changes: 10 additions & 21 deletions abundantrnaParse.m
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
function [Header,Sequence]...
=abundantrnaParse(cdnaHeader,cdnaSequence,ncrnaHeader,ncrnaSequence,seqData,params)

% switch length(varargin)
% case 0
% seqData = [];
% params = struct('species','Mouse','verbose',1,...
% 'percent',0.001,...
% 'keys',{'ENS\w*T\d*',':rRNA',':Mt_rRNA',':tRNA',':Mt_tRNA'});
% case 1
% seqData = varargin{1};
% params = struct('species','Mouse','verbose',1,...
% 'percent',0.001,...
% 'keys',{'ENS\w*T\d*',':rRNA',':Mt_rRNA',':tRNA',':Mt_tRNA'});
% otherwise
% seqData = varargin{1};
% params = varargin{2};
% end
% params = struct('species','Mouse','verbose',1,...
% 'percent',0.001,...
% 'keys',{':rRNA',':Mt_rRNA',':tRNA',':Mt_tRNA'});

if params(1).verbose
disp('generating abundant rna database files for Blast');
Expand All @@ -35,16 +23,17 @@
end

for n = 1:length(ncrnaHeader)
if ~(isempty(pos{2,1}{n,1}) && isempty(pos{3,1}{n,1})...
&& isempty(pos{4,1}{n,1}) && isempty(pos{5,1}{n,1}))
transcriptID{end+1,1} = ncrnaHeader{n,1}(1:pos{1,1}{n,1});
if ~(isempty(pos{1,1}{n,1}) && isempty(pos{2,1}{n,1})...
&& isempty(pos{3,1}{n,1}) && isempty(pos{4,1}{n,1}))
temp=regexp(ncrnaHeader{n,1}, ':');
transcriptID{end+1,1} = ncrnaHeader{n,1}(1:temp(1)-1);
end
end
transcriptID = unique(transcriptID);

Header = vertcat(cdnaHeader,ncrnaHeader);
Sequence = vertcat(cdnaSequence,ncrnaSequence);
[Header, Sequence] = pickExpressedSeq(transcriptID, Header, Sequence, params);
[Header, Sequence] = pickExpressedSeq(transcriptID, Header, Sequence);

abundantrna = [params(1).species '.abundantrna.fas'];
if exist(abundantrna, 'file')
Expand All @@ -56,8 +45,8 @@
% MatLab's use of blastlocal requires short entry names
simpleHeader = Header;
for n = 1:length(Header)
pos = regexp(Header{n,1}, params(1).keys, 'end');
simpleHeader{n,1} = Header{n,1}(1:pos);
pos = regexp(Header{n,1}, ':');
simpleHeader{n,1} = Header{n,1}(1:pos(1)-1);
end
if exist(abundantrnaDb, 'file')
delete([abundantrnaDb '*']);
Expand Down
35 changes: 15 additions & 20 deletions cdnaParse.m
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
function [Header,Sequence]=cdnaParse(cdna,seqData,params)

% cdna = 'C:\OligoArray\Mouse38.cdna.fa';

% switch length(varargin)
% case 0
% seqData = [];
% params = struct('species','Mouse','verbose',1,...
% 'keys',{'ENS\w*T\d*','ENS\w*G\d*'});
% case 1
% seqData = varargin{1};
% params = struct('species','Mouse','verbose',1,...
% 'keys',{'ENS\w*T\d*','ENS\w*G\d*'});
% otherwise
% seqData = varargin{1};
% params = varargin{2};
% end
% params = struct('species','Mouse','verbose',1,...
% 'dir1','C:\FISHerMan\Db\Mouse38.cdna.fa',...
% 'keys',{'cdna','gene:\S*'});

if params(1).verbose
disp('reading the cdna data file');
Expand All @@ -34,23 +22,30 @@

for n = 1:length(Header)
temp = Header{n,1};
temp1 = temp(pos1{1,1}{n,1}:pos2{1,1}{n,1});
temp2 = temp(pos1{2,1}{n,1}:pos2{2,1}{n,1});
temp1 = temp(1:pos1{1,1}{n,1}-2);
temp2 = temp(pos1{2,1}{n,1}+5:pos2{2,1}{n,1});

if isempty(temp1)
disp('missing transcript ID');
elseif strfind(temp1,'.')
temp1pos=strfind(temp1,'.');
temp1=temp1(1:temp1pos(1)-1);
end
if isempty(temp2)
disp('missing gene ID');
elseif strfind(temp2,'.')
temp2pos=strfind(temp2,'.');
temp2=temp2(1:temp2pos(1)-1);
end

Header{n,1} = strcat(temp1, ':', temp2);
end

if ~isempty(seqData)
if params(1).verbose
disp(' picking expressed sequences according to RNA-seq data');
end
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence, params);
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence);
end

if params(1).verbose
Expand All @@ -71,8 +66,8 @@
% MatLab's use of blastlocal requires short entry names
simpleHeader = Header;
for n = 1:length(Header)
pos = regexp(Header{n,1}, params(1).keys, 'end');
simpleHeader{n,1} = Header{n,1}(1:pos);
pos = regexp(Header{n,1}, ':');
simpleHeader{n,1} = Header{n,1}(1:pos(1)-1);
end
if exist(cdnaDb, 'file')
delete([cdnaDb '*']);
Expand Down
2 changes: 1 addition & 1 deletion main.m
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
[probeHeader,probeSequence,probeSequence3Seg,probeSequenceCore]...
=blast1stPCR(adapterList,probeHeader,probeSequence,probeSequence3Seg,probeSequenceCore,params.onePCR);

%% Save the probes of each transcripts into individual files
%% Remove non-specific probes that will affect other synthesis steps
[probeHeader,probeSequence,probeSequence3Seg,probeSequenceCore]...
=blastOtherSteps(adapterList,probeHeader,probeSequence,probeSequence3Seg,probeSequenceCore,params.otherSteps);

Expand Down
38 changes: 18 additions & 20 deletions ncrnaParse.m
Original file line number Diff line number Diff line change
@@ -1,20 +1,9 @@
function [Header,Sequence]=ncrnaParse(ncrna,seqData,trna,params)

% ncrna = 'C:\OligoArray\Mouse38.ncrna.fa';

% switch length(varargin)
% case 0
% seqData = [];
% params = struct('species','Mouse','verbose',1,...
% 'keys',{'ENS\w*T\d*','ENS\w*G\d*','gene_biotype:\S*'});
% case 1
% seqData = varargin{1};
% params = struct('species','Mouse','verbose',1,...
% 'keys',{'ENS\w*T\d*','ENS\w*G\d*','gene_biotype:\S*'});
% otherwise
% seqData = varargin{1};
% params = varargin{2};
% end
% params = struct('species','Mouse','verbose',1,...
% 'dir1','C:\FISHerMan\Db\Mouse38.ncrna.fa',...
% 'tRNA',1,'dirT','C:\FISHerMan\Db\Mouse.trna.fas',...
% 'keys',{'ncrna','gene:\S*','gene_biotype:\S*'});

if params(1).verbose
disp('reading the ncrna data file');
Expand All @@ -34,18 +23,27 @@

for n = 1:length(Header)
temp = Header{n,1};
temp1 = temp(pos1{1,1}{n,1}:pos2{1,1}{n,1});
temp2 = temp(pos1{2,1}{n,1}:pos2{2,1}{n,1});
temp1 = temp(1:pos1{1,1}{n,1}-2);
temp2 = temp(pos1{2,1}{n,1}+5:pos2{2,1}{n,1});
temp3 = temp(pos1{3,1}{n,1}+13:pos2{3,1}{n,1});

if isempty(temp1)
disp('missing transcript ID');
elseif strfind(temp1,'.')
temp1pos=strfind(temp1,'.');
temp1=temp1(1:temp1pos(1)-1);
end
if isempty(temp2)
disp('missing gene ID');
elseif strfind(temp2,'.')
temp2pos=strfind(temp2,'.');
temp2=temp2(1:temp2pos(1)-1);
end
if isempty(temp3)
disp('missing gene type');
elseif strfind(temp3,'.')
temp3pos=strfind(temp3,'.');
temp3=temp3(1:temp3pos(1)-1);
end

Header{n,1} = strcat(temp1, ':', temp2, ':', temp3);
Expand All @@ -55,7 +53,7 @@
if params(1).verbose
disp(' picking expressed sequences according to RNA-seq data');
end
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence, params);
[Header, Sequence] = pickExpressedSeq(seqData, Header, Sequence);
end

if ~isempty(trna)
Expand Down Expand Up @@ -89,8 +87,8 @@
% MatLab's use of blastlocal requires short entry names
simpleHeader = Header;
for n = 1:length(Header)
pos = regexp(Header{n,1}, params(1).keys, 'end');
simpleHeader{n,1} = Header{n,1}(1:pos);
pos = regexp(Header{n,1}, ':');
simpleHeader{n,1} = Header{n,1}(1:pos(1)-1);
end
if exist(ncrnaDb, 'file')
delete([ncrnaDb '*']);
Expand Down
66 changes: 17 additions & 49 deletions oligosParse.m
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
function oligos = oligosParse(params)

% oligos = 'C:\OligoArray\oligos.txt';

% if length(varargin) >= 1
% params = varargin{1};
% else
% params = struct('species','Mouse','verbose',1,...
% 'keys',{'ENS\w*T\d*','ENS\w*G\d*'},'number',48,...
% 'thres',30,'querySize',30,'DbSize',2*10^5,'seqNum',1000,...
% 'blastArgs','-S 2','parallel', 0,...
% 'specialTranscripts','C:\FISHerMan\Db\Mouse.STList.fas');
% end
% params = struct('species','Mouse','verbose',1,...
% 'number',48,'seqNum',1000,'thres',30,'querySize',30,...
% 'DbSize',2*10^5,'blastArgs','-S 2','parallel', 0,...
% 'specialTranscripts','C:\FISHerMan\Db\Mouse.STList.fas');

if params(1).verbose
disp('reading the result file from OligoArray');
end

oligos = [params(1).species '.tempoligos.txt'];
if ~exist(oligos, 'file')
warning('missing important files from OligoArray');
end

if params(1).verbose
disp('reading the result file from OligoArray');
end

fid = fopen(oligos,'r');
fmt = '%s %f %f %f %f %f %f %s %s %s %*[^\n]';
temp = textscan(fid,fmt,'CollectOutput',true,'delimiter','\t','TreatAsEmpty','NA');
Expand All @@ -39,14 +32,16 @@
if params(1).verbose && mod(n, 1000) == 1
disp([' analyzing oligo entry no. ' num2str(n)]);
end
[pos1, pos2] = regexp(nonspecificHits{n,1}, params(2).keys, 'start', 'end');

pos = regexp(geneNames{n,1}, ':');
geneName=geneNames{n,1}(pos(1)+1:end);

flag = 0;
for m = 1:length(pos1)
if ~strfind(geneNames{n,1}, nonspecificHits{n,1}(pos1(m):pos2(m)))
flag = 1;
end
if length(regexp(nonspecificHits{n,1}, geneName)) < ...
length(regexp(nonspecificHits{n,1}, ':'))
flag = 1;
end

if flag == 1
index = [index n];
end
Expand All @@ -66,33 +61,6 @@
specificHits{n,1} = seqrcomplement(specificHits{n,1});
end

%% Remove transcripts without enough oligos
% if params.verbose
% disp('removing transcripts without enough oligos');
% end

% pos = regexp(geneNames, params(1).keys, 'end');
% trimNames = {};
% for n = 1:length(geneNames)
% trimNames{end+1} = geneNames{n,1}(1:pos{n,1});
% end
% trimNames = trimNames';
% uniqueNames = unique(trimNames, 'stable');
%
% indexTotal = zeros(length(trimNames),1);
% for n = 1:length(uniqueNames)
% index = ismember(trimNames, uniqueNames{n,1});
% if sum(index) < params.number
% indexTotal = indexTotal+index;
% disp(['transcript ' uniqueNames{n,1} ' has less than ' num2str(params.number) ' probes']);
% end
% end
%
% indexTotal = logical(indexTotal);
% geneNames(indexTotal) = [];
% nonspecificHits(indexTotal) = [];
% specificHits(indexTotal) = [];

%% Blast oligos against abundant rna database and remove non-specific oligos
[geneNames,specificHits,nonspecificHits]...
=blastAbundantRNASimple(geneNames,specificHits,nonspecificHits,params);
Expand All @@ -103,10 +71,10 @@
disp('removing transcripts without enough oligos');
end

pos = regexp(geneNames, params(1).keys, 'end');
pos = regexp(geneNames, ':');
trimNames = {};
for n = 1:length(geneNames)
trimNames{end+1} = geneNames{n,1}(1:pos{n,1});
trimNames{end+1} = geneNames{n,1}(1:pos{n,1}(1)-1);
end
trimNames = trimNames';
uniqueNames = unique(trimNames, 'stable');
Expand Down
Loading

0 comments on commit ae60d7e

Please sign in to comment.