-
Notifications
You must be signed in to change notification settings - Fork 0
/
squid.h
352 lines (308 loc) · 12.3 KB
/
squid.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
/* SQUID - A C function library for biological sequence analysis
* Copyright (C) 1992-1995 Sean R. Eddy
*
* This source code is distributed under terms of the
* GNU General Public License. See the files COPYING
* and GNULICENSE for further details.
*
*/
#ifndef SQUIDH_INCLUDED
#define SQUIDH_INCLUDED
/* squid.h
* last modified Sun Aug 15 12:05:58 1993
*
* Header file for my library of sequence functions.
*
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
/* Library version info is made available as globals to
 * any interested program. The two arrays are only declared
 * here; they are defined in iupac.c with the other globals.
 */
extern char squid_version[];
extern char squid_date[];
/****************************************************
 * Error codes returned by squid library functions
 *
 * squid_errno is only declared here; its definition lives
 * in another translation unit.
 * NOTE(review): presumably holds one of the SQERR_* codes
 * after a failed library call -- confirm against the callers.
 ****************************************************/
#define SQERR_OK 0 /* no error */
#define SQERR_UNKNOWN 1 /* generic error, unidentified */
#define SQERR_NODATA 2 /* unexpectedly NULL stream */
#define SQERR_MEM 3 /* malloc or realloc failed */
#define SQERR_NOFILE 4 /* file not found */
#define SQERR_FORMAT 5 /* file format not recognized */
#define SQERR_PARAMETER 6 /* bad parameter passed to func */
#define SQERR_DIVZERO 7 /* error in sre_math.c */
extern int squid_errno;
/****************************************************
* Single sequence information
****************************************************/
/* SQINFO (struct seqinfo_s): annotation attached to a single sequence.
 * The fixed-size text fields reserve one byte for the terminating NUL.
 */
typedef struct seqinfo_s {
  int    flags;      /* which of the fields below carry data (SQINFO_* bits) */
  char   name[32];   /* sequence name, at most 31 characters */
  char   id[32];     /* database identifier, at most 31 characters */
  char   acc[32];    /* database accession number, at most 31 characters */
  char   desc[128];  /* description, at most 127 characters */
  int    len;        /* length of this sequence */
  int    start;      /* start position on the source sequence, 1..len */
  int    stop;       /* end position on the source sequence, 1..len */
  int    olen;       /* original length of the source sequence */
  int    type;       /* kRNA, kDNA, kAmino, or kOtherSeq */
  float  weight;     /* weight on the sequence */
  char  *ss;         /* secondary structure string, indexed 0..len-1 */
  char  *free;       /* free text, unparsed comments */
  char  *sa;         /* % side chain surface accessibility, indexed 0..len-1 */
} SQINFO;

/* Bit masks for SQINFO.flags; each name matches one SQINFO field. */
#define SQINFO_NAME (1 << 0)
#define SQINFO_ID (1 << 1)
#define SQINFO_ACC (1 << 2)
#define SQINFO_DESC (1 << 3)
#define SQINFO_START (1 << 4)
#define SQINFO_STOP (1 << 5)
#define SQINFO_LEN (1 << 6)
#define SQINFO_TYPE (1 << 7)
#define SQINFO_WGT (1 << 8)
#define SQINFO_OLEN (1 << 9)
#define SQINFO_SS (1 << 10)
#define SQINFO_FREE (1 << 11)
#define SQINFO_SA (1 << 12)
/****************************************************
* Sequence i/o: originally from Don Gilbert's readseq
****************************************************/
/* buffer size for reading in lines from sequence files */
#define LINEBUFLEN 4096
/* sequence types parsed by Seqtype() */
#define kOtherSeq 0
#define kDNA 1
#define kRNA 2
#define kAmino 3
/* Sequence file formats recognized.
 * Codes run from kMinFormat to kMaxFormat; kMaxFormat currently equals
 * kSquid, the highest code defined below. kNumFormats is kMaxFormat + 1,
 * so it also counts slot 0 (kUnknown).
 */
#define kUnknown 0 /* format not determinable */
#define kIG 1
#define kGenBank 2
#define kNBRF 3
#define kEMBL 4
#define kGCG 5
#define kStrider 6
#define kPearson 7
#define kZuker 8
#define kIdraw 9 /* idraw-style PostScript (write only) */
#define kSelex 10 /* my flat text alignment format */
#define kMSF 11 /* GCG MSF multiple alignment format */
#define kPIR 12 /* PIR-CODATA format */
#define kRaw 13 /* unformatted, raw sequence (output only) */
#define kSquid 14 /* my sequence database format */
#define kMinFormat 1 /* SRE: kUnknown doesn't count */
#define kMaxFormat 14
#define kNumFormats (kMaxFormat + 1)
/* kNoformat lies outside the kUnknown..kMaxFormat range */
#define kNoformat -1 /* format not tested */
/* SQFILE (struct ReadSeqVars): bundle of variables used by the sequence
 * reading code. Fields without a note were undocumented in the original.
 */
typedef struct ReadSeqVars {
  FILE   *f;
  char    sbuffer[LINEBUFLEN];  /* the line currently being worked on */
  int     seqlen;
  int     maxseq;
  int     dash_equals_n;        /* hack flag: affects how EMBL files are read */
  char   *seq;
  SQINFO *sqinfo;               /* name, id, etc. */
  char   *sp;
} SQFILE;
/****************************************************
* Database indexing (GSI index file format)
****************************************************/
/* A GSI (generic sequence index) file is composed of
* recnum + nfiles + 1 records. Each record contains
* three fields; key, file number, and disk offset.
* Record 0 contains:
* [ "GSI" ] [ nfiles ] [ recnum ]
* Records 1..nfiles map file names to file numbers, and contain:
* [ filename ] [ file number, 1..nfiles ] [ 0 (unused) ]
* Records nfiles+1 to recnum+nfiles provide disk offset
* and file number indices for every key:
* [ key ] [ file number ] [ offset]
*/
/* GSIFILE (struct gsi_s): handle on an open GSI index file, holding
 * the stream together with the recnum and nfiles counts.
 */
typedef struct gsi_s {
  FILE  *gsifp;   /* the open GSI index file */
  long   recnum;  /* number of records */
  short  nfiles;  /* number of files */
} GSIFILE;
/* Used for GSI (generic sequence index) files, for rapid fetching
 * from databases. A GSI record contains:
 * [ key name] [file number] [disk offset]
 *
 * GSI_KEYSIZE is the size of the key field (32 chars). GSI_RECSIZE is
 * that plus one short and one long. Both are built from sizeof(), so
 * the record size follows the short/long widths of the compiling platform.
 */
#define GSI_RECSIZE (32 * sizeof(char) + sizeof(short) + sizeof(long))
#define GSI_KEYSIZE (32 * sizeof(char))
/****************************************************
* Sequence alphabet: see also iupac.c
****************************************************/
/* IUPAC symbols defined globally in iupac.c */
/* One table entry: a symbol and its complement, each stored both as a
 * plain character and in binary-coded form.
 */
struct iupactype {
char sym; /* character representation */
char symcomp; /* complement (regular char) */
char code; /* my binary rep */
char comp; /* binary encoded complement */
};
extern struct iupactype iupac[];
/* NOTE(review): presumably the number of entries in iupac[] -- confirm in iupac.c */
#define IUPACSYMNUM 17
extern char *stdcode1[]; /* 1-letter amino acid translation code */
extern char *stdcode3[]; /* 3-letter amino acid translation code */
extern double aafq[]; /* amino acid occurrence frequencies */
extern char aa_alphabet[]; /* amino acid alphabet */
extern int aa_index[]; /* convert 0..19 indices to 0..26 */
/* valid symbols in IUPAC code (upper and lower case) */
#define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
/* the 20 one-letter amino acid codes */
#define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY"
#define DNA_ALPHABET "ACGT"
#define RNA_ALPHABET "ACGU"
/* space, tab and newline */
#define WHITESPACE " \t\n"
/* isgap(c): nonzero when c is one of the gap characters ' ', '.', '_'
 * or '-'. The argument is evaluated up to four times, so do not pass
 * an expression with side effects (e.g. *s++).
 */
#define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-')
/****************************************************
* Alignment information
****************************************************/
/* Structure: aliinfo_s
*
* Purpose: Optional information returned from an alignment file.
*
* flags: always used. Flags for which info is valid/alloced.
*
* alen: always returned. Alignments are always flushed right
* with gaps so that all aseqs are the same length, alen.
* Available for all alignment formats.
*
* cs: 0..alen-1, just like the alignment. Contains single-letter
* secondary structure codes for consensus structure; "<>^+"
* for RNA, "EHL." for protein. May be NULL if unavailable
* from seqfile. Only available for SELEX format files.
*
* rf: 0..alen-1, just like the alignment. rf is an arbitrary string
* of characters, used for annotating columns. Blanks are
* interpreted as non-canonical columns and anything else is
* considered canonical. Only available from SELEX format files.
*
* sqinfo: always returned. Array of 0..nseq-1
* per-sequence information structures, carrying
* name, id, accession, coords, and weight.
*
*/
/* AINFO (struct aliinfo_s): optional annotation read along with an
 * alignment.
 */
typedef struct aliinfo_s {
  int    flags;              /* which info below is valid (AINFO_* bits) */
  int    alen;               /* alignment length, in columns */
  char   au[64];             /* "author" information */
  char  *cs;                 /* consensus secondary structure string */
  char  *rf;                 /* reference coordinate system */
  struct seqinfo_s *sqinfo;  /* name, id, coord info for each sequence */
} AINFO;

/* Bit masks for AINFO.flags. */
#define AINFO_ALEN (1 << 0)
#define AINFO_AUTH (1 << 1)
#define AINFO_CS (1 << 2)
#define AINFO_RF (1 << 3)
/****************************************************
* Cluster analysis and phylogenetic tree support
****************************************************/
/* struct phylo_s - a phylogenetic tree
 *
 * For N sequences, there will generally be an array of 0..N-2
 * phylo_s structures. [0] is the root. The indexes of left and
 * right children are somewhat confusing so be careful. The
 * indexes can have values of 0..2N-2. If they are 0..N-1, they
 * represent pointers to individual sequences. If they are
 * >= N, they represent pointers to a phylo_s structure
 * at (index - N).
 *
 * NOTE(review): is_in is described as a "0..N" flag array; with N
 * sequences one would expect 0..N-1 -- confirm against the allocator.
 */
struct phylo_s {
int parent; /* index of parent, N..2N-2, or -1 for root */
int left; /* index of one of the branches, 0..2N-2 */
int right; /* index of other branch, 0..2N-2 */
float diff; /* difference score between seqs */
float lblen; /* left branch length */
float rblen; /* right branch length */
char *is_in; /* 0..N flag array, 1 if seq included */
int incnum; /* number of seqs included at this node */
};
/* Strategies for cluster analysis: cluster by mean distance,
 * maximum distance, or minimum distance. The numeric values are
 * spelled out and match the implicit numbering of the enumerators.
 */
enum clust_strategy {
  CLUSTER_MEAN = 0,
  CLUSTER_MAX  = 1,
  CLUSTER_MIN  = 2
};
/****************************************************
* Generic data structure support
****************************************************/
/* a struct intstack_s implements a pushdown stack for storing
 * single integers. Each element holds one int plus a pointer to
 * another element of the same type.
 */
struct intstack_s {
int data; /* the stored integer */
struct intstack_s *nxt; /* link to another stack element */
};
/****************************************************
* Binary nucleotide alphabet support
****************************************************/
/* Binary encoding of the IUPAC code for nucleotides
 *
 * four-bit "word", permitting rapid degenerate matching;
 * one bit per base, with A as the high bit:
 *     A  C  G  T/U
 *     0  0  1  0      <- example: G (NTG = 2)
 *
 * Each degenerate symbol is the bitwise OR of the bases it stands for.
 * NTGAP (16) lies above the four base bits; NTEND (0) has no bit set.
 */
#define NTA 8
#define NTC 4
#define NTG 2
#define NTT 1
#define NTU 1
#define NTN 15 /* A|C|G|T */
#define NTR 10 /* A|G */
#define NTY 5 /* C|T */
#define NTM 12 /* A|C */
#define NTK 3 /* G|T */
#define NTS 6 /* C|G */
#define NTW 9 /* A|T */
#define NTH 13 /* A|C|T */
#define NTB 7 /* C|G|T */
#define NTV 14 /* A|C|G */
#define NTD 11 /* A|G|T */
#define NTGAP 16 /* GAP */
#define NTEND 0 /* null string terminator */
/* ntmatch(): bitwise comparison of two nuc's
 * note that it's sensitive to the order;
 * probe may be degenerate but target should not be
 *
 * Yields nonzero when every bit set in target is also set in probe.
 * Both arguments are fully parenthesized, so compound expressions such
 * as ntmatch(p, NTA|NTC) group correctly. target is expanded twice, so
 * it must not have side effects.
 */
#define ntmatch(probe, target) (((probe) & (target)) == (target))
/****************************************************
* Miscellaneous macros and defines
****************************************************/
/* CHOOSE(a): multiplies the result of sre_random() by a and converts the
 * product to int. sre_random() is declared elsewhere.
 * NOTE(review): this yields 0..a-1 only if sre_random() returns a value
 * in [0,1) -- confirm against its definition.
 */
#define CHOOSE(a) ((int) (sre_random() * (a)))
/* INT_SWAP(a,b): exchange the values of two int lvalues.
 * The temporary has a library-specific name so that a caller variable
 * called `foo` is no longer captured by the macro body, and both
 * arguments are parenthesized. Each argument is expanded twice, so pass
 * plain lvalues without side effects. The body stays a brace block (not
 * do/while) so existing call sites written without a trailing
 * semicolon keep compiling.
 */
#define INT_SWAP(a,b) {int sq_int_swap_tmp_; sq_int_swap_tmp_ = (b); (b) = (a); (a) = sq_int_swap_tmp_;}
/* Free2DArray(ptr, n): release an array of n heap-allocated rows and
 * then the array of row pointers itself. A NULL ptr is accepted and
 * ignored; NULL rows are fine too, because free(NULL) is a no-op.
 *
 * ptr is parenthesized at every use so that expression arguments
 * (e.g. a conditional) group correctly, and the loop index has a
 * library-specific name so it cannot capture a caller variable passed
 * as n. ptr is expanded several times: pass an expression without side
 * effects. The macro does not reset the caller's pointer. The body
 * stays a brace block so existing call sites keep compiling.
 */
#define Free2DArray(ptr, n) \
{ int sq_f2d_row_;\
  if ((ptr) != NULL) { \
    for (sq_f2d_row_ = 0; sq_f2d_row_ < (n); sq_f2d_row_++) free((ptr)[sq_f2d_row_]);\
    free(ptr);\
  } }
/* ScalarsEqual(a,b): true when a and b differ by less than 1e-7 in
 * absolute value. The tolerance is absolute, not scaled to the
 * magnitude of the operands. Uses fabs() from <math.h>.
 */
#define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7)
/* MIN(a,b) / MAX(a,b): smaller / larger of two values, defined only if
 * the platform headers have not already provided them.
 * Arguments are parenthesized so operands containing lower-precedence
 * operators (|, &, ?:) compare and select correctly. The chosen
 * argument is evaluated twice, so avoid side effects such as MIN(i++, j).
 */
#ifndef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif
/* Fallback boolean constants in both spellings; each is defined only
 * when no earlier header has already defined it.
 */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef True
#define True 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef False
#define False 0
#endif
/* someday, Sun Microsystems will conform to ANSI... */
/* Fallback exit codes for systems whose <stdlib.h> lacks them; the
 * presence of EXIT_SUCCESS alone decides whether both get defined.
 */
#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1
#endif
#include "sqfuncs.h" /* squid function declarations */
#endif /* SQUIDH_INCLUDED */