-
Notifications
You must be signed in to change notification settings - Fork 13
/
GFaSeqGet.h
311 lines (286 loc) · 10.4 KB
/
GFaSeqGet.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#ifndef GFASEQGET_H
#define GFASEQGET_H
#include "GFastaIndex.h"
#define MAX_FASUBSEQ 0x20000000
//max 512MB sequence data held in memory at a time
class GSubSeq {
public:
uint sqstart; //1-based coord of subseq start on sequence
uint sqlen; //length of subseq loaded
char* sq; //actual subsequence data will be stored here
// (with end-of-line characters removed)
/*char* xseq; //the exposed pointer to the last requested subsequence start
off_t xstart; //the coordinate start for the last requested subseq
off_t xlen; //the last requested subseq len*/
GSubSeq() {
sqstart=0;
sqlen=0;
sq=NULL;
/* xseq=NULL;
xstart=0;
xlen=0;*/
}
void forget() { //forget about pointer data, so we can reuse it
sq=NULL;
sqstart=0;
sqlen=0;
}
~GSubSeq() {
GFREE(sq);
}
// genomic, 1-based coordinates:
void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
//check for overlap with previous window and realloc/extend appropriately
//returns offset from seq that corresponds to sstart
// the window will keep extending until MAX_FASUBSEQ is reached
};
//
class GFaSeqGet {
char* fname; //file name where the sequence resides
FILE* fh;
off_t fseqstart; //file offset where the sequence actually starts
uint seq_len; //total sequence length, if known (when created from GFastaIndex)
uint line_len; //length of each line of text
uint line_blen; //binary length of each line
// = line_len + number of EOL character(s)
GSubSeq* lastsub;
void initialParse(off_t fofs=0, bool checkall=true);
const char* loadsubseq(uint cstart, int& clen);
void finit(const char* fn, off_t fofs, bool validate);
public:
//GStr seqname; //current sequence name
char* seqname;
GFaSeqGet(): fname(NULL), fh(NULL), fseqstart(0), seq_len(0),
line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) {
}
GFaSeqGet(const char* fn, off_t fofs, bool validate=false):fname(NULL), fh(NULL),
fseqstart(0), seq_len(0), line_len(0), line_blen(0),
lastsub(NULL), seqname(NULL) {
finit(fn,fofs,validate);
}
GFaSeqGet(const char* fn, bool validate=false):fname(NULL), fh(NULL),
fseqstart(0), seq_len(0), line_len(0), line_blen(0),
lastsub(NULL), seqname(NULL) {
finit(fn,0,validate);
}
GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
//constructor from GFastaIndex record
GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
~GFaSeqGet() {
if (fname!=NULL) {
GFREE(fname);
fclose(fh);
}
GFREE(seqname);
delete lastsub;
}
const char* seq(uint cstart=1, int clen=0) {
int cend = clen==0 ? 0 : cstart+clen-1;
return getRange(cstart, cend);
}
const char* subseq(uint cstart, int& clen);
const char* getRange(uint cstart=1, uint cend=0) {
if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
if (cstart>cend) { Gswap(cstart, cend); }
int clen=cend-cstart+1;
//int rdlen=clen;
return subseq(cstart, clen);
}
//caller is responsible for deallocating the return string
char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
//uncached, read and return allocated buffer
//caller is responsible for deallocating the return string
char* fetchSeq(int* retlen=NULL) {
int clen=(seq_len>0) ? seq_len : MAX_FASUBSEQ;
if (lastsub) { delete lastsub; lastsub=NULL; }
subseq(1, clen);
if (retlen) *retlen=clen;
char* r=lastsub->sq;
lastsub->forget();
if (clen>0) {
r[clen]=0;
}
else {
r=NULL;
}
return r;
}
void loadall(uint32 max_len=0) {
//TODO: better read the whole sequence differently here - line by line
//so when EOF or another '>' line is found, the reading stops!
int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
subseq(1, clen);
}
void load(uint cstart, uint cend) {
//cache as much as possible
if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
int clen=cend-cstart+1;
subseq(cstart, clen);
}
int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
int getseqlen() { return seq_len; } //known when loaded with GFastaIndex
off_t getseqofs() { return fseqstart; }
int getLineLen() { return line_len; }
int getLineBLen() { return line_blen; }
//reads a subsequence starting at genomic coordinate cstart (1-based)
};
//multi-fasta sequence handling
class GFastaDb {
public:
char* fastaPath;
GFastaIndex* faIdx; //could be a cdb .cidx file
//int last_fetchid;
const char* last_seqname;
GFaSeqGet* faseq;
//GCdbYank* gcdb;
GFastaDb(const char* fpath=NULL, bool forceIndexFile=true):fastaPath(NULL), faIdx(NULL), last_seqname(NULL),
faseq(NULL) {
//gcdb=NULL;
init(fpath, forceIndexFile);
}
void init(const char* fpath, bool writeIndexFile=true) {
if (fpath==NULL || fpath[0]==0) return;
//last_fetchid=-1;
last_seqname=NULL;
if (!fileExists(fpath))
GError("Error: file/directory %s does not exist!\n",fpath);
fastaPath=Gstrdup(fpath);
//GStr gseqpath(fpath);
if (fileExists(fastaPath)>1) { //exists and it's not a directory
char* fainame=Gstrdup(fastaPath,4);
int fainamelen=strlen(fainame);
//int fainame_len=strlen(fainame);
if (trimSuffix(fastaPath, ".fai")) {
//.fai index file given directly
if (!fileExists(fastaPath))
GError("Error: cannot find fasta file for index %s !\n", fastaPath);
}
else { //append .fai as needed
strcpy(fainame+fainamelen, ".fai");
fainamelen+=4;
}
//GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
faIdx=new GFastaIndex(fastaPath, fainame);
char* fainamecwd=fainame; //will hold just the file name without the path
char* plast=strrchr(fainamecwd, '/');
if (plast!=NULL) {
fainamecwd=plast+1; //point to the file name only
}
if (!faIdx->hasIndex()) { //could not load index file .fai
//try current directory (Warning: might not be the correct index for that file!)
if (plast==NULL) {
if (fileExists(fainamecwd)>1) {
faIdx->loadIndex(fainamecwd);
}
}
} //tried to load index
if (!faIdx->hasIndex()) { //no index file to be loaded, build the index
//if (forceIndexFile)
// GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
faIdx->buildIndex(); //build index in memory only
if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
if (writeIndexFile) {
//GMessage("Fasta index rebuilt.\n");
FILE* fcreate=fopen(fainame, "w");
char* idxfname=fainame;
if (fcreate==NULL) {
GMessage("Warning: cannot create fasta index file %s! (permissions?)\n", fainame);
if (fainame!=fainamecwd) {
//try cwd
idxfname=fainamecwd;
GMessage(" Attempting to create the index in the current directory..\n");
if ((fcreate=fopen(fainamecwd, "w"))==NULL)
GError("Error: cannot create fasta index file %s!\n", fainamecwd);
}
}
if (fcreate!=NULL) {
if (faIdx->storeIndex(fcreate)<faIdx->getCount())
GMessage("Warning: error writing the index file %s!\n", idxfname);
else GMessage("FASTA index file %s created.\n", idxfname);
}
} //file storage of index requested
} //creating FASTA index
GFREE(fainame);
} //multi-fasta file
}
GFaSeqGet* fetchFirst(const char* fname, bool checkFasta=false) {
faseq=new GFaSeqGet(fname, checkFasta);
faseq->loadall();
//last_fetchid=gseq_id;
GFREE(last_seqname);
last_seqname=Gstrdup(faseq->seqname);
return faseq;
}
char* getFastaFile(const char* gseqname) {
if (fastaPath==NULL) return NULL;
int gnl=strlen(gseqname);
char* s=Gstrdup(fastaPath, gnl+8);
int slen=strlen(s);
if (s[slen-1]!='/') {//CHPATHSEP ?
s[slen]='/';
slen++;
s[slen]='\0';
}
//s.append(gseqname);
strcpy(s+slen, gseqname);
slen+=gnl;
if (!fileExists(s)) {
//s.append(".fa")
strcpy(s+slen, ".fa");
slen+=3;
}
if (!fileExists(s)) { strcpy(s+slen, "sta"); slen+=3; }
if (fileExists(s)) return Gstrdup(s);
else {
GMessage("Warning: cannot find genomic sequence file %s/%s{.fa,.fasta}\n",fastaPath, s);
return NULL;
}
GFREE(s);
}
GFaSeqGet* fetch(const char* gseqname) {
if (fastaPath==NULL) return NULL;
if (last_seqname!=NULL && (strcmp(gseqname, last_seqname)==0)
&& faseq!=NULL) return faseq;
delete faseq;
faseq=NULL;
//last_fetchid=-1;
GFREE(last_seqname);
last_seqname=NULL;
//char* gseqname=GffObj::names->gseqs.getName(gseq_id);
if (faIdx!=NULL) { //fastaPath was the multi-fasta file name and it must have an index
GFastaRec* farec=faIdx->getRecord(gseqname);
if (farec!=NULL) {
faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
farec->line_len, farec->line_blen);
faseq->loadall(); //just cache the whole sequence, it's faster
//last_fetchid=gseq_id;
last_seqname=Gstrdup(gseqname);
}
else {
GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
return NULL;
}
}
else { //directory with FASTA files named as gseqname
char* sfile=getFastaFile(gseqname);
if (sfile!=NULL) {
faseq=new GFaSeqGet(sfile);
faseq->loadall();
//last_fetchid=gseq_id;
GFREE(sfile);
}
} //one fasta file per contig
//else GMessage("Warning: fasta index not available, cannot retrieve sequence %s\n",
// gseqname);
return faseq;
}
~GFastaDb() {
GFREE(fastaPath);
GFREE(last_seqname);
//delete gcdb;
delete faIdx;
delete faseq;
}
};
GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid);
#endif