Skip to content

Commit a492616

Browse files
committed
Faster base conversion
1 parent 5809141 commit a492616

32 files changed

+1283
-207
lines changed

demo/test.c

Lines changed: 101 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,19 @@
44
#define S_MP_RAND_JENKINS_C
55
#include "s_mp_rand_jenkins.c"
66

7+
/* TODO: Make it an environment variable via main.yml?
8+
This is for testing only, so no reason to add checks to the build process. */
9+
#ifdef __has_include
10+
# if __has_include (<valgrind/valgrind.h>)
11+
# include <valgrind/valgrind.h>
12+
# else
13+
# define RUNNING_ON_VALGRIND 1
14+
# endif
15+
#else
16+
# define RUNNING_ON_VALGRIND 1
17+
#endif
18+
19+
720
static long rand_long(void)
821
{
922
long x;
@@ -1157,30 +1170,27 @@ static int test_mp_montgomery_reduce(void)
11571170

11581171
}
11591172

1173+
#include <time.h>
11601174
static int test_mp_read_radix(void)
11611175
{
11621176
char buf[4096];
1163-
size_t written;
1177+
size_t written, maxlen;
11641178

1165-
mp_int a;
1166-
DOR(mp_init_multi(&a, NULL));
1179+
int bignum, i, j, k, limit_test;
1180+
char *buffer, *bcpy, *startb;
1181+
clock_t start, stop, t_slow, t_fast;
1182+
double slow = 0.0, fast = 0.0, sum_slow = 0.0, sum_fast = 0.0;
1183+
double s_bases_slow[65] = {0.0};
1184+
double s_bases_fast[65] = {0.0};
1185+
1186+
mp_int a, b, c;
1187+
DOR(mp_init_multi(&a, &b, &c, NULL));
11671188

11681189
DO(mp_read_radix(&a, "123456", 10));
11691190

11701191
DO(mp_to_radix(&a, buf, sizeof(buf), &written, 10));
11711192
printf(" '123456' a == %s, length = %zu", buf, written);
11721193

1173-
/* See comment in mp_to_radix.c */
1174-
/*
1175-
if( (err = mp_to_radix(&a, buf, 3u, &written, 10) ) != MP_OKAY) goto LBL_ERR;
1176-
printf(" '56' a == %s, length = %zu\n", buf, written);
1177-
1178-
if( (err = mp_to_radix(&a, buf, 4u, &written, 10) ) != MP_OKAY) goto LBL_ERR;
1179-
printf(" '456' a == %s, length = %zu\n", buf, written);
1180-
if( (err = mp_to_radix(&a, buf, 30u, &written, 10) ) != MP_OKAY) goto LBL_ERR;
1181-
printf(" '123456' a == %s, length = %zu, error = %s\n",
1182-
buf, written, mp_error_to_string(err));
1183-
*/
11841194
DO(mp_read_radix(&a, "-123456", 10));
11851195
DO(mp_to_radix(&a, buf, sizeof(buf), &written, 10));
11861196
printf("\r '-123456' a == %s, length = %zu", buf, written);
@@ -1198,10 +1208,81 @@ static int test_mp_read_radix(void)
11981208
printf("%s, %lu\n", buf, (unsigned long)a.dp[0] & 3uL);
11991209
}
12001210

1201-
mp_clear(&a);
1211+
/* Save a bit of testing time */
1212+
if (RUNNING_ON_VALGRIND != 0) {
1213+
limit_test = 2000;
1214+
} else {
1215+
limit_test = 6000;
1216+
}
1217+
1218+
/* Test the fast method with a slightly larger number (about a minute on an older machine) */
1219+
for (k = 100; k < limit_test; k += 1000) {
1220+
bignum = k;
1221+
buffer = (char *)malloc((size_t)(bignum + 2));
1222+
if (buffer == NULL) {
1223+
goto LBL_ERR;
1224+
}
1225+
DO(mp_rand(&a, bignum / MP_DIGIT_BIT));
1226+
for (i = 2; i < 65; i++) {
1227+
start = clock();
1228+
for (j = 0; j < 100; j++) {
1229+
DO(mp_to_radix(&a, buffer, (size_t)(bignum + 1), &written, i));
1230+
mp_zero(&b);
1231+
DO(mp_read_radix(&b, buffer, i));
1232+
/* Check round trip */
1233+
EXPECT(mp_cmp(&a, &b) == MP_EQ);
1234+
}
1235+
stop = clock();
1236+
t_fast = stop - start;
1237+
1238+
start = clock();
1239+
for (j = 0; j < 100; j++) {
1240+
maxlen = (size_t)(bignum + 1);
1241+
bcpy = buffer;
1242+
/* s_mp_slower_to_radix is very rudimentary and needs some help to work as a stand-alone */
1243+
startb = bcpy;
1244+
DO(s_mp_slower_to_radix(&a, &bcpy, &maxlen, &written, i, false));
1245+
bcpy = startb;
1246+
mp_zero(&c);
1247+
DO(s_mp_slower_read_radix(&c, bcpy, 0, strlen(bcpy), i));
1248+
/* Check round trip */
1249+
EXPECT(mp_cmp(&a, &c) == MP_EQ);
1250+
/* Check against result of fast algorithms above */
1251+
EXPECT(mp_cmp(&b, &c) == MP_EQ);
1252+
}
1253+
stop = clock();
1254+
t_slow = stop - start;
1255+
1256+
slow = (double)t_slow/(double)CLOCKS_PER_SEC;
1257+
fast = (double)t_fast/(double)CLOCKS_PER_SEC;
1258+
1259+
fprintf(stderr,"Bits %d Base %d SLOW: %.10f, FAST: %.10f\n", mp_count_bits(&a), i, slow, fast);
1260+
1261+
sum_slow += slow;
1262+
sum_fast += fast;
1263+
s_bases_slow[i] += slow;
1264+
s_bases_fast[i] += fast;
1265+
}
1266+
free(buffer);
1267+
}
1268+
1269+
fprintf(stderr,"\nSUM: SLOW: %.10f, FAST: %.10f\n",sum_slow, sum_fast);
1270+
1271+
for (i = 2; i < 65; i++) {
1272+
fprintf(stderr,"Sums for Base %d SLOW: %.10f, FAST: %.10f\n",i, s_bases_slow[i], s_bases_fast[i]);
1273+
}
1274+
1275+
/* Valgrind overhead does not allow for timings. */
1276+
if ((RUNNING_ON_VALGRIND == 0) && (MP_DIGIT_BIT >= 20)) {
1277+
/* Very basic check if the fast algorithms are actually faster. */
1278+
EXPECT(sum_slow > sum_fast);
1279+
}
1280+
1281+
1282+
mp_clear_multi(&a, &b, &c, NULL);
12021283
return EXIT_SUCCESS;
12031284
LBL_ERR:
1204-
mp_clear(&a);
1285+
mp_clear_multi(&a, &b, &c, NULL);
12051286
return EXIT_FAILURE;
12061287
}
12071288

@@ -2583,7 +2664,7 @@ static int unit_tests(int argc, char **argv)
25832664
T1(mp_prime_next_prime, MP_PRIME_NEXT_PRIME),
25842665
T1(mp_prime_rand, MP_PRIME_RAND),
25852666
T1(mp_rand, MP_RAND),
2586-
T1(mp_read_radix, MP_READ_RADIX),
2667+
T2(mp_read_radix,ONLY_PUBLIC_API, MP_READ_RADIX),
25872668
T1(mp_read_write_ubin, MP_TO_UBIN),
25882669
T1(mp_read_write_sbin, MP_TO_SBIN),
25892670
T1(mp_reduce_2k, MP_REDUCE_2K),
@@ -2600,13 +2681,16 @@ static int unit_tests(int argc, char **argv)
26002681
T3(s_mp_div_recursive, ONLY_PUBLIC_API, S_MP_DIV_RECURSIVE, S_MP_DIV_SCHOOL),
26012682
T3(s_mp_div_small, ONLY_PUBLIC_API, S_MP_DIV_SMALL, S_MP_DIV_SCHOOL),
26022683
T2(s_mp_sqr, ONLY_PUBLIC_API, S_MP_SQR),
2684+
26032685
/* s_mp_mul_comba not (yet) testable because s_mp_mul branches to s_mp_mul_comba automatically */
2686+
26042687
T2(s_mp_sqr_comba, ONLY_PUBLIC_API, S_MP_SQR_COMBA),
26052688
T2(s_mp_mul_balance, ONLY_PUBLIC_API, S_MP_MUL_BALANCE),
26062689
T2(s_mp_mul_karatsuba, ONLY_PUBLIC_API, S_MP_MUL_KARATSUBA),
26072690
T2(s_mp_sqr_karatsuba, ONLY_PUBLIC_API, S_MP_SQR_KARATSUBA),
26082691
T2(s_mp_mul_toom, ONLY_PUBLIC_API, S_MP_MUL_TOOM),
26092692
T2(s_mp_sqr_toom, ONLY_PUBLIC_API, S_MP_SQR_TOOM)
2693+
26102694
#undef T3
26112695
#undef T2
26122696
#undef T1

etc/get_limbsize.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#include <stdlib.h>
2+
#include <stdio.h>
3+
4+
#include "tommath.h"
5+
6+
int main(void)
7+
{
8+
printf("%d",MP_DIGIT_BIT);
9+
10+
exit(EXIT_SUCCESS);
11+
}

etc/makefile

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ LTM_TUNE_CFLAGS = $(CFLAGS) $(LTM_CFLAGS) -Wall -W -Wextra -Wshadow -O3 -I../
66
# libname when you can't install the lib with install
77
LIBNAME=../libtommath.a
88

9-
all: pprime tune test_standalone mersenne drprime 2kprime mont
9+
all: pprime tune test_standalone mersenne drprime 2kprime mont getlimbsize graphs
1010

1111
#provable primes
1212
pprime: pprime.o
@@ -36,10 +36,20 @@ drprime: drprime.o
3636
mont: mont.o
3737
$(CC) $(LTM_TUNE_CFLAGS) mont.o $(LIBNAME) -o mont
3838

39+
# Reads MP_DIGIT_BIT from tommath.h, so take care that there is the correct MP_xxBIT
40+
# in CFLAGS and/or LTM_CFLAGS when compiling from this directory ("libtommath/etc").
41+
getlimbsize: get_limbsize.o
42+
$(CC) $(LTM_TUNE_CFLAGS) get_limbsize.o $(LIBNAME) -o get_limbsize
43+
44+
# Make pretty pictures (3000 is the maximum number of limbs to print for mul/sqr)
45+
# "tune" runs twice because it also runs automatically when it is built.
46+
graphs: tune get_limbsize
47+
./tune_it.sh 3000
48+
gnuplot -c plot_graphs.gp `./get_limbsize`
3949

4050
clean:
4151
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime mont 2kprime pprime.dat \
42-
tuning_list multiplying squaring test *.da *.dyn *.dpi *~
52+
tuning_list get_limbsize *.da *.dyn *.dpi *~
4353
rm -rf .libs
4454

4555
.PHONY: tune

etc/makefile.icc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,18 @@ tune: tune.o
3232
$(CC) $(CFLAGS) tune.o $(LIBNAME) -o tune
3333
./tune_it.sh
3434

35+
# Reads MP_DIGIT_BIT from tommath.h, so take care that there is the correct MP_xxBIT
36+
# in CFLAGS and/or LTM_CFLAGS when compiling from this directory ("libtommath/etc").
37+
getlimbsize: get_limbsize.o
38+
$(CC) $(LTM_TUNE_CFLAGS) get_limbsize.o $(LIBNAME) -o get_limbsize
39+
40+
# Make pretty pictures (3000 is the maximum number of limbs to print for mul/sqr)
41+
# "tune" runs twice because it also runs automatically when it is built.
42+
graphs: tune get_limbsize
43+
./tune_it.sh 3000
44+
gnuplot -c plot_graphs.gp `./get_limbsize`
45+
46+
3547
# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
3648
tune86: tune.c
3749
nasm -f coff timer.asm
@@ -64,4 +76,4 @@ mont: mont.o
6476

6577

6678
clean:
67-
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il tuning_list
79+
rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat multiplying squaring readradix writeradix test get_limbsize *png *.il tuning_list

etc/plot_graphs.gp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
set term pngcairo size 720,540
2+
# Works well for most forms of color blindness
3+
set colorsequence podo
4+
5+
set key top left;
6+
7+
set ylabel "Time"
8+
set xlabel "Operand size (limbs)"
9+
10+
set output "multiplying".ARG1.".png";
11+
set title "Comparing fast and slow multiplying [".ARG1." bits limbsize]";
12+
plot "multiplying".ARG1."" using 1:2 w lines t "slow", "multiplying".ARG1."" using 1:3 w lines t "fast"
13+
14+
set output "squaring".ARG1.".png";
15+
set title "Comparing fast and slow squaring [".ARG1." bits limbsize]";
16+
plot "squaring".ARG1."" using 1:2 w lines t "slow", "squaring".ARG1."" using 1:3 w lines t "fast"
17+
18+
set xlabel "Operand size (bits)"
19+
set output "readradix".ARG1.".png";
20+
set title "Comparing fast and slow radix conversion (reading) [".ARG1." bits limbsize]";
21+
plot "readradix".ARG1."" using 1:2 w lines t "slow", "readradix".ARG1."" using 1:3 w lines t "fast"
22+
23+
set output "writeradix".ARG1.".png";
24+
set title "Comparing fast and slow radix conversion (writing) [".ARG1." bits limbsize]";
25+
plot "writeradix".ARG1."" using 1:2 w lines t "slow", "writeradix".ARG1."" using 1:3 w lines t "fast"
26+
27+
28+
29+

0 commit comments

Comments
 (0)