From d4b9161f8334bcff96ee2a61ef0f9003797fb810 Mon Sep 17 00:00:00 2001 From: Massimiliano Fasi Date: Thu, 30 May 2024 15:38:26 +0100 Subject: [PATCH] Add support for saturation arithmetic --- mex/cpfloat.c | 68 ++++-- mex/cpfloat.m | 16 +- src/cpfloat_binary32.h | 2 +- src/cpfloat_binary64.h | 2 +- src/cpfloat_definitions.h | 39 +++- src/cpfloat_template.h | 61 +++--- test/cpfloat_test.m | 207 +++++++++++++------ test/cpfloat_test.ts | 425 +++++++++++++++++++++----------------- 8 files changed, 512 insertions(+), 308 deletions(-) diff --git a/mex/cpfloat.c b/mex/cpfloat.c index 85eec0c..2249531 100644 --- a/mex/cpfloat.c +++ b/mex/cpfloat.c @@ -41,11 +41,14 @@ void mexFunction(int nlhs, fpopts->precision = 11; fpopts->emin = -14; fpopts->emax = 15; - fpopts->subnormal = CPFLOAT_SUBN_USE; fpopts->explim = CPFLOAT_EXPRANGE_TARG; fpopts->round = CPFLOAT_RND_NE; + fpopts->saturation = CPFLOAT_SAT_NO; + fpopts->subnormal = CPFLOAT_SUBN_USE; + fpopts->flip = CPFLOAT_SOFTERR_NO; fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL; @@ -54,6 +57,7 @@ void mexFunction(int nlhs, /* Parse second argument and populate fpopts structure. */ if (nrhs > 1) { bool is_subn_rnd_default = false; + bool is_saturation_default = false; if(!mxIsEmpty(prhs[1]) && !mxIsStruct(prhs[1])) { mexErrMsgIdAndTxt("cpfloat:invalidstruct", "Second argument must be a struct."); @@ -62,7 +66,7 @@ void mexFunction(int nlhs, if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) - /* Use default format, for compatibility with chop. */ + /* Set default format, for compatibility with chop. */ strcpy(fpopts->format, "h"); else if (mxGetClassID(tmp) == mxCHAR_CLASS) strcpy(fpopts->format, mxArrayToString(tmp)); @@ -80,6 +84,7 @@ void mexFunction(int nlhs, fpopts->precision = 4; fpopts->emin = -6; fpopts->emax = 8; + is_saturation_default = true; } else if (!strcmp(fpopts->format, "q52") || !strcmp(fpopts->format, "fp8-e5m2") || !strcmp(fpopts->format, "E5M2")) { @@ -161,6 +166,31 @@ void mexFunction(int nlhs, else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) fpopts->round = *((double *)mxGetData(tmp)); } + tmp = mxGetField(prhs[1], 0, "saturation"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->saturation = CPFLOAT_SAT_NO; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->saturation = *((double *)mxGetData(tmp)); + } else { + if (is_saturation_default) + fpopts->saturation = CPFLOAT_SAT_USE; /* Default for E4M3. */ + else + fpopts->saturation = CPFLOAT_SAT_NO; + } + tmp = mxGetField(prhs[1], 0, "subnormal"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->subnormal = CPFLOAT_SUBN_USE; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->subnormal = *((double *)mxGetData(tmp)); + } else { + if (is_subn_rnd_default) + fpopts->subnormal = CPFLOAT_SUBN_RND; /* Default for bfloat16. */ + else + fpopts->subnormal = CPFLOAT_SUBN_USE; + } + tmp = mxGetField(prhs[1], 0, "flip"); if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) @@ -288,10 +318,11 @@ void mexFunction(int nlhs, /* Allocate and return second output. */ if (nlhs > 1) { - const char* field_names[] = {"format", "params", "subnormal", "round", - "flip", "p", "explim"}; + const char* field_names[] = {"format", "params", "explim", + "round", "saturation", "subnormal", + "flip", "p"}; mwSize dims[2] = {1, 1}; - plhs[1] = mxCreateStructArray(2, dims, 7, field_names); + plhs[1] = mxCreateStructArray(2, dims, 8, field_names); mxSetFieldByNumber(plhs[1], 0, 0, mxCreateString(fpopts->format)); mxArray *outparams = mxCreateDoubleMatrix(1,3,mxREAL); @@ -301,30 +332,35 @@ void mexFunction(int nlhs, outparamsptr[2] = fpopts->emax; mxSetFieldByNumber(plhs[1], 0, 1, outparams); - mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); - double *outsubnormalptr = mxGetData(outsubnormal); - outsubnormalptr[0] = fpopts->subnormal; - mxSetFieldByNumber(plhs[1], 0, 2, outsubnormal); + mxArray *outexplim = mxCreateDoubleMatrix(1, 1, mxREAL); + double *outexplimptr = mxGetData(outexplim); + outexplimptr[0] = fpopts->explim; + mxSetFieldByNumber(plhs[1], 0, 2, outexplim); mxArray *outround = mxCreateDoubleMatrix(1,1,mxREAL); double *outroundptr = mxGetData(outround); outroundptr[0] = fpopts->round; mxSetFieldByNumber(plhs[1], 0, 3, outround); + mxArray *outsaturation = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsaturationptr = mxGetData(outsaturation); + outsaturationptr[0] = fpopts->saturation; + mxSetFieldByNumber(plhs[1], 0, 4, outsaturation); + + mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsubnormalptr = mxGetData(outsubnormal); + outsubnormalptr[0] = fpopts->subnormal; + mxSetFieldByNumber(plhs[1], 0, 5, outsubnormal); + mxArray *outflip = mxCreateDoubleMatrix(1,1,mxREAL); double *outflipptr = mxGetData(outflip); outflipptr[0] = fpopts->flip; - mxSetFieldByNumber(plhs[1], 0, 4, outflip); + mxSetFieldByNumber(plhs[1], 0, 6, outflip); mxArray *outp = mxCreateDoubleMatrix(1,1,mxREAL); double *outpptr = mxGetData(outp); outpptr[0] = fpopts->p; - mxSetFieldByNumber(plhs[1], 0, 5, outp); - - mxArray *outexplim = mxCreateDoubleMatrix(1,1,mxREAL); - double *outexplimptr = mxGetData(outexplim); - outexplimptr[0] = fpopts->explim; - mxSetFieldByNumber(plhs[1], 0, 6, outexplim); + mxSetFieldByNumber(plhs[1], 0, 7, outp); } if (nlhs > 2) diff --git a/mex/cpfloat.m b/mex/cpfloat.m index 4f1dfa3..4ea0196 100644 --- a/mex/cpfloat.m +++ b/mex/cpfloat.m @@ -39,11 +39,6 @@ % the target format, respectively. The default value of this field is % the vector [11,-14,15]. % -% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. -% The target floating-point format will not support subnormal numbers if -% this field is set to 0, and will support them otherwise. The default value -% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. -% % * The scalar FPOPTS.explim specifies the support for an extended exponent % range. The target floating-point format will have the exponent range of % the storage format ('single' or 'double', depending on the class of X) if @@ -63,6 +58,17 @@ % Any other value results in no rounding. The default value for this field % is 1. % +% * The scalar FPOPTS.saturation specifies whether saturation arithmetic is in +% use. On overflow, the target floating-point format will use the largest +% representable floating-point if this field is set to 0, and infinity +% otherwise. The default value for this field is 1 if the target format is +% 'E4M3' and 1 otherwise. + +% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. +% The target floating-point format will not support subnormal numbers if +% this field is set to 0, and will support them otherwise. The default value +% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. +% % * The scalar FPOPTS.flip specifies whether the function should simulate the % occurrence of a single bit flip striking the floating-point representation % of elements of Y. Possible values are: diff --git a/src/cpfloat_binary32.h b/src/cpfloat_binary32.h index 3ecbe83..9ce1843 100644 --- a/src/cpfloat_binary32.h +++ b/src/cpfloat_binary32.h @@ -289,8 +289,8 @@ static inline int cpf_fmaf(float *X, const float *A, const float *B, #define INTSUFFIX U #define DEFPREC 24 -#define DEFEMAX 127 #define DEFEMIN -126 +#define DEFEMAX 127 #define NLEADBITS 9 #define NBITS 32 #define FULLMASK 0xFFFFFFFFU diff --git a/src/cpfloat_binary64.h b/src/cpfloat_binary64.h index 5df0a6b..fa57f63 100644 --- a/src/cpfloat_binary64.h +++ b/src/cpfloat_binary64.h @@ -289,8 +289,8 @@ static inline int cpf_fma(double *X, const double *A, const double *B, #define INTTYPE uint64_t #define INTSUFFIX ULL #define DEFPREC 53 -#define DEFEMAX 1023 #define DEFEMIN -1022 +#define DEFEMAX 1023 #define NLEADBITS 12 #define NBITS 64 #define FULLMASK 0xFFFFFFFFFFFFFFFFULL diff --git a/src/cpfloat_definitions.h b/src/cpfloat_definitions.h index 009963d..3c049ff 100644 --- a/src/cpfloat_definitions.h +++ b/src/cpfloat_definitions.h @@ -10,6 +10,7 @@ * * + @ref cpfloat_explim_t, * + @ref cpfloat_rounding_t, + * + @ref cpfloat_saturation_t, * + @ref cpfloat_softerr_t, * + @ref cpfloat_subnormal_t, * @@ -88,6 +89,16 @@ typedef enum { CPFLOAT_NO_RND = 8, } cpfloat_rounding_t; +/** + * @brief Saturation modes available in CPFloat. + */ +typedef enum { + /** Use standard arithmetic. */ + CPFLOAT_SAT_NO = 0, + /** Use saturation arithmetic. */ + CPFLOAT_SAT_USE = 1, +} cpfloat_saturation_t; + /** * @brief Soft fault simulation modes available in CPFloat. */ @@ -214,14 +225,6 @@ typedef struct { * exponent is larger than the maximum allowed by the storage format. */ cpfloat_exponent_t emax; - /** - * @brief Support for subnormal numbers in target format. - * - * @details Subnormal numbers are supported if this field is set to - * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current - * rounding mode if it is set to `CPFLOAT_SUBN_RND`. - */ - cpfloat_subnormal_t subnormal; /** * @brief Support for extended exponents in target format. * @@ -256,6 +259,24 @@ typedef struct { * those in the list above is specified. */ cpfloat_rounding_t round; + /** + * @brief Support for subnormal numbers in target format. + * + * @details Subnormal numbers are supported if this field is set to + * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current + * rounding mode if it is set to `CPFLOAT_SUBN_RND`. + */ + cpfloat_saturation_t saturation; + /** + * @brief Support for subnormal numbers in target format. + * + * @details Subnormal numbers are supported if this field is set to + * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current + * rounding mode if it is set to `CPFLOAT_SUBN_RND`. + */ + cpfloat_subnormal_t subnormal; + + /* Bit flips. */ /** * @brief Support for soft errors. * @@ -281,6 +302,8 @@ typedef struct { * contain a number in the interval [0,1]. */ double p; + + /* Internal: state of pseudo-random number generator. */ /** * @brief Internal state of pseudo-random number generator for single bits. * diff --git a/src/cpfloat_template.h b/src/cpfloat_template.h index 426f445..28cf71c 100644 --- a/src/cpfloat_template.h +++ b/src/cpfloat_template.h @@ -158,11 +158,13 @@ typedef union { #define FPPARAMS ADDSUFFIXTO(fpparams) typedef struct { cpfloat_precision_t precision; - cpfloat_exponent_t emax; cpfloat_exponent_t emin; - cpfloat_subnormal_t subnormal; + cpfloat_exponent_t emax; cpfloat_rounding_t round; + cpfloat_saturation_t saturation; + cpfloat_subnormal_t subnormal; FPTYPE ftzthreshold; + FPTYPE ofvalue; FPTYPE xmin; FPTYPE xmax; FPTYPE xbnd; @@ -292,40 +294,43 @@ static inline FPPARAMS COMPUTE_GLOBAL_PARAMS(const optstruct *fpopts, /* Actual precision and exponent range. */ *retval = 0; cpfloat_precision_t precision = fpopts->precision; - cpfloat_exponent_t emax = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? - fpopts->emax : - DEFEMAX; cpfloat_exponent_t emin = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? fpopts->emin : DEFEMIN; + cpfloat_exponent_t emax = fpopts->explim == CPFLOAT_EXPRANGE_TARG ? + fpopts->emax : + DEFEMAX; if (precision > DEFPREC) { precision = DEFPREC; *retval = 1; } - if (emax > DEFEMAX) { - emax = DEFEMAX; + if (emin < DEFEMIN) { + emin = DEFEMIN; *retval = 2; } - if (emin < DEFEMIN) { - emin = DEFEMIN; + if (emax > DEFEMAX) { + emax = DEFEMAX; *retval = 2; } FPTYPE xmin = ldexp(1., emin); /* Smallest pos. normal. */ FPTYPE xmins = ldexp(1., emin-precision+1); /* Smallest pos. subnormal. */ FPTYPE ftzthreshold = (fpopts->subnormal == CPFLOAT_SUBN_USE) ? xmins : xmin; + FPTYPE xmax = ldexp(1., emax) * (2-ldexp(1., 1-precision)); FPTYPE xbnd = ldexp(1., emax) * (2-ldexp(1., -precision)); + FPTYPE ofvalue = (fpopts->saturation == CPFLOAT_SAT_USE) ? xmax : INFINITY; /* Bitmasks. */ INTTYPE leadmask = FULLMASK << (DEFPREC-precision); /* To keep. */ INTTYPE trailmask = leadmask ^ FULLMASK; /* To discard. */ - FPPARAMS params = {precision, emax, emin, fpopts->subnormal, fpopts->round, - ftzthreshold, xmin, xmax, xbnd, + FPPARAMS params = {precision, emin, emax, fpopts->round, + fpopts->saturation, fpopts->subnormal, + ftzthreshold, ofvalue, xmin, xmax, xbnd, leadmask, trailmask, NULL, NULL}; return params; @@ -401,7 +406,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + \ @@ -425,7 +430,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y)); \ } else if (ABS(y) > p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + (lp->trailmask>>1)) & lp->leadmask); \ @@ -450,7 +455,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ INTTYPE LSB = ((INTOF(y) >> (DEFPREC-lp->precision)) & INTCONST(1)) \ @@ -477,11 +482,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) > 0 ? p->ftzthreshold : 0; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) > p->xmax) \ - *(x) = INFINITY; \ - else if (*(y) < -p->xmax && *(y) != -INFINITY) \ + *(x) = p->ofvalue; \ + else if (*(y) < -p->xmax && *(y) != -p->ofvalue) \ *(x) = -p->xmax; \ else /* *(y) == -INFINITY */ \ - *(x) = -INFINITY; \ + *(x) = -p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y) == 0) /* Add ulp if x is positive. */ \ @@ -509,11 +514,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) >= 0 ? 0 : -p->ftzthreshold; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) < -p->xmax) \ - *(x) = -INFINITY; \ - else if (*(y) > p->xmax && *(y) != INFINITY) \ + *(x) = -p->ofvalue; \ + else if (*(y) > p->xmax && *(y) != p->ofvalue) \ *(x) = p->xmax; \ else /* *(y) == INFINITY */ \ - *(x) = INFINITY; \ + *(x) = p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y)) /* Subtract ulp if x is positive. */ \ @@ -532,7 +537,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RD_TWD_ZERO_SCALAR_OTHER_EXP(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold) { /* Underflow */ \ *(x) = FPOF(SIGN(y)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ @@ -581,7 +586,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y); \ } \ if (ABS(x) >= p->xbnd) /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); /* Stochastic rounding with equal probabilities. */ #define RS_EQUI_SCALAR(x, y, p, lp) \ @@ -589,9 +594,9 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ftzthreshold : 0)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ randombit = GENBIT(p->BITSEED); \ - *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? INFINITY : p->xmax)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ofvalue : p->xmax)); \ } else if ((INTOF(y) & lp->trailmask)) { /* Not exactly representable. */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(INTOF(y) & lp->leadmask); \ @@ -604,7 +609,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RO_SCALAR(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ @@ -651,7 +656,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, numelem, p, lp) \ PARALLEL_STRING(PARALLEL) \ { \ - if (p->emax == DEFEMAX) { \ + if (p->emax == DEFEMAX && p->saturation == CPFLOAT_SAT_NO) { \ FOR_STRING(PARALLEL) \ for (size_t i=0; i0,true); - d = cpfloat(c',fp); assert_eq(norm(d-c',1)>0,true); + d = cpfloat(c,fp); + assert_eq(norm(d-c,1)>0,true); + d = cpfloat(c',fp); + assert_eq(norm(d-c',1)>0,true); fp.p = 0; % No bits flipped. - d = cpfloat(c,fp); assert_eq(d,d); + d = cpfloat(c,fp); + assert_eq(d,d); fp.p = 1; % All bits flipped. - d = cpfloat(c,fp); assert_eq(all(d ~= c),true); + d = cpfloat(c,fp); + assert_eq(all(d ~= c),true); clear cpfloat [~,fp] = cpfloat; assert_eq(fp.subnormal,1) assert_eq(fp.format,'h') - [c,options] = cpfloat(pi); + [~,options] = cpfloat(pi); assert_eq(options.format,'h') assert_eq(options.subnormal,1) assert_eq(options.round,1) @@ -91,7 +117,7 @@ clear fp fp.format = 'd'; - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'d') assert_eq(options.subnormal,1) assert_eq(options.params, [53 -1022 1023]) @@ -101,7 +127,19 @@ assert_eq(fp.params, [53 -1022 1023]) clear fp - fp.format = 'bfloat16'; [c,options] = cpfloat(pi,fp); + fp.format = 'E4M3'; + [~,options] = cpfloat(pi,fp); + assert_eq(options.format,'E4M3') + assert_eq(options.saturation,1) + assert_eq(options.params, [4 -6 8]) + [~,fp] = cpfloat; + assert_eq(fp.format,'E4M3') + assert_eq(fp.saturation,1) + assert_eq(fp.params, [4 -6 8]) + + clear fp + fp.format = 'bfloat16'; + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'bfloat16') assert_eq(options.subnormal,0) assert_eq(options.params, [8 -126 127]) @@ -114,7 +152,8 @@ [~,fp] = cpfloat; fp.format = 'b'; fp = rmfield(fp, 'params'); - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,0) % No saturation if that field was empty. assert_eq(options.subnormal,1) % No subnormals only if that field was empty. % Check these usages do not give an error. @@ -134,7 +173,8 @@ B = A + randn(size(A))*1e-12; C = cpfloat(B,options); assert_eq(A,C); - A2 = hilb(6); C = cpfloat(A2); + A2 = hilb(6); + C = cpfloat(A2); options.format = 'c'; options.params = [8 -126 127]; % bfloat16 @@ -152,7 +192,7 @@ [X1,opt] = cpfloat(A,options); [X2,opt2] = cpfloat(A,options2); assert_eq(X1,X2) - % assert_eq(cpfloat(A,options),cpfloat(A,options2)); + assert_eq(cpfloat(A,opt),cpfloat(A,opt2)); % Row vector clear options @@ -185,14 +225,14 @@ elseif i == 2 % Half precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('half'); - options.format = 'h' + options.format = 'h'; elseif i == 3 % Quarter precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('q43'); options.format = 'E4M3'; % Modification for OCP compliant q43. emin = -6; % Previously thought to be 1-emax=-7. - emax = 8; % Previously thought to be 7 + emax = 8; % Previously thought to be 7 emins = emin + 1 - p; % Exponent of smallest subnormal number. xmins = 2^emins; xmin = 2^emin; @@ -282,7 +322,19 @@ assert_eq(c,x) end + % Saturation tests. + options.saturation = 1; + for j = 1:6 + options.round = j; + x = inf; + c = cpfloat(x,options); + assert_eq(c,xmax) + c = cpfloat(-x,options); + assert_eq(c,-xmax) + end + % Infinities tests. + options.saturation = 0; for j = 1:6 options.round = j; x = inf; @@ -345,14 +397,16 @@ end options.subnormal = 1; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [x(1:5) inf 1 1]; assert_eq(c,c_expected) options.subnormal = 0; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [0 0 x(3:5) inf 1 1]; @@ -409,12 +463,24 @@ % Do not limit exponent. options.explim = 0; - x = xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = xmins/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmins/2; c = cpfloat(x,options); assert_eq(c,x) + x = xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) A = [pi -pi; pi -pi]; C = cpfloat(A,options); options.explim = 1; @@ -498,21 +564,32 @@ % Test rounding with CHOPFAST versus native rounding. options.format = 's'; - m = 100; y = zeros(3,n); z = y; + m = 100; + y = zeros(3,n); + z = y; for i = 1:m x = randn; - options.round = 2; y(i,1) = cpfloat(x,options); - options.round = 3; y(i,2) = cpfloat(x,options); - options.round = 4; y(i,3) = cpfloat(x,options); + options.round = 2; + y(i,1) = cpfloat(x,options); + options.round = 3; + y(i,2) = cpfloat(x,options); + options.round = 4; + y(i,3) = cpfloat(x,options); if usingoctave - fesetround(inf); z(i,1) = single(x); - fesetround(-inf); z(i,2) = single(x); - fesetround(0); z(i,3) = single(x); + fesetround(inf); + z(i,1) = single(x); + fesetround(-inf); + z(i,2) = single(x); + fesetround(0); + z(i,3) = single(x); else % Use undocumented function to set rounding mode in MATLAB. - feature('setround',inf), z(i,1) = single(x); - feature('setround',-inf), z(i,2) = single(x); - feature('setround',0), z(i,3) = single(x); + feature('setround',inf); + z(i,1) = single(x); + feature('setround',-inf); + z(i,2) = single(x); + feature('setround',0); + z(i,3) = single(x); end end assert_eq(y,z) @@ -533,9 +610,15 @@ c = cpfloat(x,options); assert_eq(c,[0 0 x(3:4)]) - options.format = 'd'; options.subnormal = 0; cpfloat([],options); - a = cpfloat(pi); assert_eq(a,pi) - options.format = 'd'; options.subnormal = 1; cpfloat([],options); + options.format = 'd'; + options.subnormal = 0; + cpfloat([],options); + a = cpfloat(pi); + assert_eq(a,pi) + + options.format = 'd'; + options.subnormal = 1; + cpfloat([],options); a = cpfloat(pi); assert_eq(a,pi) x = pi^2; @@ -563,8 +646,10 @@ yd = cpfloat(pd); assert_eq(double(ys),yd) - options.format = 'h'; options.round = 2; - as = single(rand(n,1)); ad = double(as); + options.format = 'h'; + options.round = 2; + as = single(rand(n,1)); + ad = double(as); delta = single(rand(n,1)); cd = cpfloat(ad + 1e-5*double(delta),options); cs = cpfloat(as + 1e-5*delta,options); @@ -581,21 +666,21 @@ % Test base 2 logarithm options.format = 'h'; options.round = 4; - x = single(2^-3 * (sum(2.^(-[0:23])))); - assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-[0:10]))))) + x = single(2^-3 * (sum(2.^(-(0:23))))); + assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-(0:10)))))) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:10])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:10))))) options.format = 's'; - x = single(2^-3 * (sum(2.^(-[0:23])))); + x = single(2^-3 * (sum(2.^(-(0:23))))); assert_eq(cpfloat(x,options), x) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:23])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:23))))) options.format = 'd'; - x = 2^-3 * (sum(2.^(-[0:52]))); + x = 2^-3 * (sum(2.^(-(0:52)))); assert_eq(cpfloat(x,options), x) options.round = 1; diff --git a/test/cpfloat_test.ts b/test/cpfloat_test.ts index be83030..df68de5 100644 --- a/test/cpfloat_test.ts +++ b/test/cpfloat_test.ts @@ -53,10 +53,13 @@ optstruct *fpopts; void fpopts_setup(void) { fpopts = malloc(sizeof(optstruct)); - fpopts->flip = 0; - fpopts->p = 0.5; fpopts->round = 1; fpopts->explim = CPFLOAT_EXPRANGE_STOR; + fpopts->saturation = CPFLOAT_SAT_NO; + + fpopts->flip = 0; + fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL; @@ -163,7 +166,7 @@ void init_intarray_rounding_double(uint64_t *x, size_t n, uint64_t first, uint64_t step) { x[0] = first + 1ul; x[1] = first + (step >> 1); - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = first - 1ul; if (i+1 < n) @@ -178,7 +181,7 @@ void init_intarray_rounding_float(uint32_t *x, size_t n, uint32_t first, uint32_t step) { x[0] = first + 1u; x[1] = first + (step >> 1); - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = first - 1u; if (i+1 < n) @@ -209,13 +212,13 @@ void init_fparray_rounding_double(double *x, size_t n, double first, double step) { x[0] = FPOFd(INTOFd(first) + 1lu); x[1] = first + step/2; - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = FPOFd(INTOFd(first) - 1lu); if (i+1 < n) - x[i+1] = FPOFd(INTOFd(first) + 1lu); + x[i+1] = FPOFd(INTOFd(first) + 1lu); if (i+2 < n) - x[i+2] = first + step/2; + x[i+2] = first + step/2; } } @@ -224,13 +227,13 @@ void init_fparray_rounding_float(float *x, size_t n, float first, float step) { x[0] = FPOFf(INTOFf(first) + 1lu); x[1] = first + step/2; - for (size_t i = 2; i < n; i+=3) { + for (size_t i = 2; i < n; i += 3) { first += step; x[i] = FPOFf(INTOFf(first) - 1lu); if (i+1 < n) - x[i+1] = FPOFf(INTOFf(first) + 1lu); + x[i+1] = FPOFf(INTOFf(first) + 1lu); if (i+2 < n) - x[i+2] = first + step/2; + x[i+2] = first + step/2; } } @@ -368,7 +371,7 @@ void check_equality_double_long_long(double *x, long long *y, size_t n) { static inline void check_equality_float(float *x, float *y, size_t n) { for (size_t j = 0; j < n; j++) { - if (!nan_safe_compare_float(x[j], y[j])) { + if (!nan_safe_compare_float(x[j], y[j])) { printf("FLOAT\n"); printf("***\nj = %ld\nx = %23.15e [%X]\ny = %23.15e\n", j, x[j], *(uint32_t *)(x + j), (float)y[j]); @@ -458,12 +461,12 @@ void check_array_float(float *y, float *x, float *ref, "pr = %d\n" "e = %d\n" "s = %d\n", - j, - x[j], * (uint32_t *)(x+j), - y[j], * (uint32_t *)(y+j), - ref[j], * (uint32_t *)(ref+j), - fpopts->round, fpopts->precision, - fpopts->emax, fpopts->subnormal); + j, + x[j], * (uint32_t *)(x+j), + y[j], * (uint32_t *)(y+j), + ref[j], * (uint32_t *)(ref+j), + fpopts->round, fpopts->precision, + fpopts->emax, fpopts->subnormal); } ck_assert(nan_safe_compare_float(y[j], ref[j])); } @@ -593,7 +596,8 @@ void select_tests_det_double(double *y, double *x, double *ref, fpopts->round = round; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -625,7 +629,8 @@ void select_tests_det_float(float *y, float *x, float *ref, fpopts->round = round; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -657,7 +662,8 @@ void select_tests_stoc_double(double *tmpin, double *tmpout, fpopts->round = mode; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -692,7 +698,8 @@ void select_tests_stoc_float(float *tmpin, float *tmpout, fpopts->round = mode; for (format = minformat; format <= maxformat; format++) { fpopts->precision = precision[format]; - fpopts->emax = emax[format]; fpopts->emin = emin[format]; + fpopts->emax = emax[format]; + fpopts->emin = emin[format]; for (subnormal = minsubnormal; subnormal <= maxsubnormal; subnormal++) { fpopts->subnormal = subnormal; for (explim = minexplim; explim <= maxexplim; explim++) { @@ -709,25 +716,25 @@ void select_tests_stoc_float(float *tmpin, float *tmpout, double *alloc_init_array_double(double *x, size_t n) { double *y = malloc(n * sizeof(*y)); - for (size_t i=0; iprecision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1.,fpopts->precision-1) - 1; // number of subnormals double *xd = malloc(n * sizeof(*xd)); init_fparray_double(xd, n, minsubnormal(fpopts), minsubnormal(fpopts)); @@ -962,10 +970,11 @@ for (size_t mode=1; mode<3; mode++) { /* Exactly representable normals. */ #test no_rounding_normal_numbers printf("1c. No rounding: normal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1., fpopts->precision-1) * 2 * fpopts->emax; uint64_t *xd = malloc(n * sizeof(*xd)); init_intarray_double(xd, n, intminnormal_double(fpopts), @@ -1014,12 +1023,13 @@ double *refxmind = malloc(n * sizeof(*refxmind)); float *xf = malloc(n * sizeof(*xf)); float *refzerof = calloc(n, sizeof(*refzerof)); float *refxminf = malloc(n * sizeof(*refxminf)); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { double *yd = allocate_array_double(xd, n, mode); float *yf = allocate_array_float(xf, n, mode); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin = minnormal(fpopts); double snxmin = minsubnormal(fpopts); double halfxmin = xmin / 2; @@ -1164,12 +1174,13 @@ double *refxmind = malloc(n * sizeof(*refxmind)); float *xf = malloc(n * sizeof(*xf)); float *refzerof = calloc(n, sizeof(*refzerof)); float *refxminf = malloc(n * sizeof(*refxminf)); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { double *yd = allocate_array_double(xd, n, mode); float *yf = allocate_array_float(xf, n, mode); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin = minsubnormal(fpopts); double halfxmin = xmin / 2; double xd_imm [] = {nextafter(0, INFINITY), @@ -1308,145 +1319,171 @@ double *refinfd = malloc(n * sizeof(*refinfd)); float *xf = malloc(n * sizeof(*xf)); float *refxmaxf = malloc(n * sizeof(*refxmaxf)); float *refinff = malloc(n * sizeof(*refinff)); -for (size_t mode=1; mode<3; mode++) { - double *yd = allocate_array_double(xd, n, mode); - float *yf = allocate_array_float(xf, n, mode); - for (size_t i = 0; i < nformats; i++) { - fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; - double xmax = maxnormal(fpopts); - double xbound = maxbound(fpopts); - double xd_imm [] = {nextafter(xmax, INFINITY), - nextafter(xbound, 0), - xbound, - nextafter(xbound, INFINITY)}; - double refd [] = {xmax, xmax, inf_double(), inf_double()}; - for (size_t j = 0; j < n; j++) { - refxmaxd[j] = xmax; - refinfd[j] = inf_double(); - } - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); - refd[2] = xmax; - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); - refd[2] = inf_double(); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refinfd, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 3, 4, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); - csign_intarray_double((uint64_t *)xd_imm, n); - csign_intarray_double((uint64_t *)refd, n); - csign_intarray_double((uint64_t *)refxmaxd, n); - csign_intarray_double((uint64_t *)refinfd, n); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); - refd[2] = -xmax; - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refinfd, n, fpopts, 3, 3, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 4, 4, i, i, -1, 1); - reset_array_double(yd, n, mode); - copy_array_double(xd, (double *)&xd_imm, n); - select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); - - float xf_imm [] = {nextafterf(xmax, INFINITY), - nextafterf(xbound, 0), - xbound, - nextafterf(xbound, INFINITY)}; - float reff [] = {xmax, xmax, inf_float(), inf_float()}; - for (size_t j = 0; j < n; j++) { - refxmaxf[j] = xmax; - refinff[j] = inf_float(); +for (size_t saturation = 0; saturation < 2; saturation++) { + fpopts->saturation = saturation; + for (size_t mode = 1; mode < 3; mode++) { + double *yd = allocate_array_double(xd, n, mode); + float *yf = allocate_array_float(xf, n, mode); + for (size_t i = 0; i < nformats; i++) { + fpopts->precision = precision[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; + double xmax = maxnormal(fpopts); + double xbound = maxbound(fpopts); + double xd_imm [] = {nextafter(xmax, INFINITY), + nextafter(xbound, 0), + xbound, + nextafter(xbound, INFINITY)}; + double refd [] = {xmax, xmax, inf_double(), inf_double()}; + if (saturation == 0) { + for (size_t j = 0; j < n; j++) { + refxmaxd[j] = xmax; + refinfd[j] = inf_double(); + } + } else { + for (size_t j = 0; j < n; j++) { + refd[2] = xmax; + refd[3] = xmax; + refxmaxd[j] = xmax; + refinfd[j] = xmax; + } + } + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); + refd[2] = xmax; + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); + if (saturation == 0) + refd[2] = inf_double(); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refinfd, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 3, 4, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); + csign_intarray_double((uint64_t *)xd_imm, n); + csign_intarray_double((uint64_t *)refd, n); + csign_intarray_double((uint64_t *)refxmaxd, n); + csign_intarray_double((uint64_t *)refinfd, n); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 1, 1, i, i, -1, 1); + refd[2] = -xmax; + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refd, n, fpopts, 0, 0, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refinfd, n, fpopts, 3, 3, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 4, 4, i, i, -1, 1); + reset_array_double(yd, n, mode); + copy_array_double(xd, (double *)&xd_imm, n); + select_tests_det_double(yd, xd, refxmaxd, n, fpopts, 7, 7, i, i, -1, 1); + + float xf_imm [] = {nextafterf(xmax, INFINITY), + nextafterf(xbound, 0), + xbound, + nextafterf(xbound, INFINITY)}; + float reff [] = {xmax, xmax, inf_float(), inf_float()}; + if (saturation == 0) { + for (size_t j = 0; j < n; j++) { + refxmaxf[j] = xmax; + refinff[j] = inf_float(); + } + } else { + reff[2] = xmax; + reff[3] = xmax; + for (size_t j = 0; j < n; j++) { + refxmaxf[j] = xmax; + refinff[j] = xmax; + } + } + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); + reff[2] = xmax; + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); + if (saturation == 0) + reff[2] = inf_float(); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refinff, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 3, 4, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + csign_intarray_float((uint32_t *)xf_imm, n); + csign_intarray_float((uint32_t *)reff, n); + csign_intarray_float((uint32_t *)refxmaxf, n); + csign_intarray_float((uint32_t *)refinff, n); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); + reff[2] = -xmax; + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 2, 2, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refinff, n, fpopts, 3, 3, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 4, 4, i, i, -1, 1); + reset_array_float(yf, n, mode); + copy_array_float(xf, (float *)&xf_imm, n); + select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); } - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); - reff[2] = xmax; - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); - reff[2] = inf_float(); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refinff, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 3, 4, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - csign_intarray_float((uint32_t *)xf_imm, n); - csign_intarray_float((uint32_t *)reff, n); - csign_intarray_float((uint32_t *)refxmaxf, n); - csign_intarray_float((uint32_t *)refinff, n); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, -1, -1, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 1, 1, i, i, -1, 1); - reff[2] = -xmax; - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, reff, n, fpopts, 0, 0, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 2, 2, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refinff, n, fpopts, 3, 3, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 4, 4, i, i, -1, 1); - reset_array_float(yf, n, mode); - copy_array_float(xf, (float *)&xf_imm, n); - select_tests_det_float(yf, xf, refxmaxf, n, fpopts, 7, 7, i, i, -1, 1); + free_array_double(yd, mode); + free_array_float(yf, mode); } - free_array_double(yd, mode); - free_array_float(yf, mode); -} + } free(xd); free(refxmaxd); free(refinfd); free(xf); free(refxmaxf); free(refinff); +fpopts->saturation = CPFLOAT_SAT_NO; /* Rounding of numbers in the subnormal range. */ #test deterministic_rounding_subnormal_numbers printf("2d. Deterministic rounding: subnormal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * ldexp(1.,fpopts->precision-1) - 1; double *xd_imm = malloc(n * sizeof(*xd_imm)); double *xd = malloc(n * sizeof(*xd)); @@ -1472,7 +1509,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, 1, 1); csign_intarray_double((uint64_t *)xd_imm, n); csign_intarray_double((uint64_t *)refd, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) refd[j] -= stepd; reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); @@ -1488,7 +1525,7 @@ for (size_t mode=1; mode<3; mode++) { csign_intarray_double((uint64_t *)refd, n); reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); - for (size_t j = 1; j < n; j+=6) + for (size_t j = 1; j < n; j += 6) refd[j] += stepd; reset_array_double(yd, n, mode); copy_array_double(xd, xd_imm, n); @@ -1573,7 +1610,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, 1, 1); csign_intarray_float((uint32_t *)xf_imm, n); csign_intarray_float((uint32_t *)reff, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) reff[j] -= stepf; reset_array_float(yf, n, mode); copy_array_float(xf, xf_imm, n); @@ -1587,7 +1624,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, 1, 1); csign_intarray_float((uint32_t *)xf_imm, n); csign_intarray_float((uint32_t *)reff, n); - for (size_t j = 1; j < n; j+=6) + for (size_t j = 1; j < n; j += 6) reff[j] += stepf; reset_array_float(yf, n, mode); copy_array_float(xf, xf_imm, n); @@ -1655,10 +1692,11 @@ for (size_t mode=1; mode<3; mode++) { /* Rounding of numbers in the normal range. */ #test deterministic_rounding_normal_numbers printf("2e. Deterministic rounding: normal numbers\n"); -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * 2 * fpopts->emax - 1); uint64_t *xd_imm = malloc(n * sizeof(*xd_imm)); uint64_t *refd = malloc(n * sizeof(*refd)); @@ -1684,7 +1722,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, -1, -1); csign_intarray_double(xd_imm, n); csign_intarray_double(refd, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) refd[j] -= stepd; reset_array_double(yd, n, mode); copy_array_double(xd, (double *)xd_imm, n); @@ -1698,7 +1736,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, -1, -1); csign_intarray_double(xd_imm, n); csign_intarray_double(refd, n); - for (size_t j = 4; j < n; j+=6) + for (size_t j = 4; j < n; j += 6) refd[j] += stepd; reset_array_double(yd, n, mode); copy_array_double(xd, (double *)xd_imm, n); @@ -1785,7 +1823,7 @@ for (size_t mode=1; mode<3; mode++) { -1, -1, i, i, -1, -1); csign_intarray_float(xf_imm, n); csign_intarray_float(reff, n); - for (size_t j = 1; j < n; j+=3) + for (size_t j = 1; j < n; j += 3) reff[j] -= stepf; reset_array_float(yf, n, mode); copy_array_float(xf, (float *)xf_imm, n); @@ -1799,7 +1837,7 @@ for (size_t mode=1; mode<3; mode++) { 0, 0, i, i, -1, -1); csign_intarray_float(xf_imm, n); csign_intarray_float(reff, n); - for (size_t j = 4; j < n; j+=6) + for (size_t j = 4; j < n; j += 6) reff[j] += stepf; reset_array_float(yf, n, mode); copy_array_float(xf, (float *)xf_imm, n); @@ -1878,11 +1916,12 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++) { +for (size_t mode = 1; mode < 3; mode++) { for(subnormal = 0; subnormal <= 1; subnormal++) { for(size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmin; if (subnormal) xmin = minsubnormal(fpopts); @@ -1929,11 +1968,12 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for(subnormal = 0; subnormal <= 1; subnormal++) { for(i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmax = maxnormal(fpopts); double xbound = maxbound(fpopts); double *yd = malloc(n * sizeof(*yd)); @@ -2009,10 +2049,11 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * ldexp(1.,fpopts->precision-1) - 1; double *xd = malloc(n * sizeof(*xd)); double stepd = minsubnormal(fpopts); @@ -2061,12 +2102,13 @@ double *tmpdout = malloc(NREPS * sizeof(*tmpdout)); float *tmpfin = malloc(NREPS * sizeof(*tmpfin)); float *tmpfout = malloc(NREPS * sizeof(*tmpfout)); double proundequi = 0.50; -for (size_t mode=1; mode<3; mode++ ) { +for (size_t mode = 1; mode < 3; mode++ ) { for (subnormal = 0; subnormal <= 1; subnormal++) { for (explim = 0; explim <= 1; explim++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * 2 * fpopts->emax - 1); uint64_t *xd = malloc(n * sizeof(*xd)); @@ -2139,7 +2181,8 @@ size_t n_roundings = 7; for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; for (size_t i = 0; i < n_roundings; i++) { fpopts->round = det_roundings[i]; @@ -2272,7 +2315,8 @@ float *reff = malloc(n * sizeof(* xf)); for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; for (size_t i = 0; i < n_roundings; i++) { fpopts->round = det_roundings[i]; // Univariate functions. @@ -2413,7 +2457,8 @@ const int resd_fpclassify [] = {FP_NORMAL, FP_NORMAL,FP_NORMAL, FP_SUBNORMAL, for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; double xmind = minnormal(fpopts); double xminds = minsubnormal(fpopts); double ad[] = {-2., -1, -xmind, -xminds, xminds, xmind, 1., CONST_SQRT2, @@ -2505,7 +2550,8 @@ const int resf_fpclassify [] = {FP_NORMAL, FP_NORMAL,FP_NORMAL, for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; float xminf = minnormal(fpopts); float xminfs = minsubnormal(fpopts); float af[] = { -2., -1, -xminf, xminf, 1., CONST_SQRT2, @@ -2588,10 +2634,11 @@ printf("4c. Next and previous floating-point number\n"); fpopts->explim = CPFLOAT_EXPRANGE_TARG; fpopts->subnormal = CPFLOAT_SUBN_USE; // Subnormals -for (size_t mode=2; mode<3; mode++) { +for (size_t mode = 2; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = (ldexp(1.,fpopts->precision-1) - 1) + 2; double *ad = malloc(n * sizeof(*ad)); @@ -2688,10 +2735,11 @@ for (size_t mode=2; mode<3; mode++) { } // Normals -for (size_t mode=2; mode<3; mode++) { +for (size_t mode = 2; mode < 3; mode++) { for (size_t i = 0; i < nformats; i++) { fpopts->precision = precision[i]; - fpopts->emax = emax[i]; fpopts->emin = emin[i]; + fpopts->emax = emax[i]; + fpopts->emin = emin[i]; size_t n = ldexp(1., fpopts->precision-1) * 2 * fpopts->emax + 2; double *ad = malloc(n * sizeof(*ad)); @@ -2972,7 +3020,8 @@ check_equality_double_long_long(rd7, xll, n-3); printf("4d. Arithmetic operations on large arrays\n"); size_t i = 0; fpopts->precision = precision[i]; -fpopts->emax = emax[i]; fpopts->emin = emin[i]; +fpopts->emax = emax[i]; +fpopts->emin = emin[i]; size_t n = 3 * (ldexp(1., fpopts->precision-1) * (ldexp(1., 5) - 2) - 1) - 2; double *ad = malloc(n * sizeof(*ad)); double *bd = malloc(n * sizeof(*bd));