Skip to content

Commit a794131

Browse files
committed
Added timing code to CUDA (it won't print when timing)
1 parent dac463c commit a794131

File tree

4 files changed

+220
-13
lines changed

4 files changed

+220
-13
lines changed

Diff for: cycleTimer.h

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#ifndef _SYRAH_CYCLE_TIMER_H_
2+
#define _SYRAH_CYCLE_TIMER_H_
3+
4+
#if defined(__APPLE__)
5+
#if defined(__x86_64__)
6+
#include <sys/sysctl.h>
7+
#else
8+
#include <mach/mach.h>
9+
#include <mach/mach_time.h>
10+
#endif // __x86_64__ or not
11+
12+
#include <stdio.h> // fprintf
13+
#include <stdlib.h> // exit
14+
15+
#elif _WIN32
16+
# include <windows.h>
17+
# include <time.h>
18+
#else
19+
# include <stdio.h>
20+
# include <stdlib.h>
21+
# include <string.h>
22+
# include <sys/time.h>
23+
#endif
24+
25+
26+
// This uses the cycle counter of the processor. Different
27+
// processors in the system will have different values for this. If
28+
// you process moves across processors, then the delta time you
29+
// measure will likely be incorrect. This is mostly for fine
30+
// grained measurements where the process is likely to be on the
31+
// same processor. For more global things you should use the
32+
// Time interface.
33+
34+
// Also note that if you processors' speeds change (i.e. processors
35+
// scaling) or if you are in a heterogenous environment, you will
36+
// likely get spurious results.
37+
class CycleTimer {
38+
public:
39+
typedef unsigned long long SysClock;
40+
41+
//////////
42+
// Return the current CPU time, in terms of clock ticks.
43+
// Time zero is at some arbitrary point in the past.
44+
static SysClock currentTicks() {
45+
#if defined(__APPLE__) && !defined(__x86_64__)
46+
return mach_absolute_time();
47+
#elif defined(_WIN32)
48+
LARGE_INTEGER qwTime;
49+
QueryPerformanceCounter(&qwTime);
50+
return qwTime.QuadPart;
51+
#elif defined(__x86_64__)
52+
unsigned int a, d;
53+
asm volatile("rdtsc" : "=a" (a), "=d" (d));
54+
return static_cast<unsigned long long>(a) |
55+
(static_cast<unsigned long long>(d) << 32);
56+
#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
57+
unsigned int val;
58+
asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
59+
return val;
60+
#else
61+
timespec spec;
62+
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
63+
return CycleTimer::SysClock(static_cast<float>(spec.tv_sec) * 1e9 + static_cast<float>(spec.tv_nsec));
64+
#endif
65+
}
66+
67+
//////////
68+
// Return the current CPU time, in terms of seconds.
69+
// This is slower than currentTicks(). Time zero is at
70+
// some arbitrary point in the past.
71+
static double currentSeconds() {
72+
return currentTicks() * secondsPerTick();
73+
}
74+
75+
//////////
76+
// Return the conversion from seconds to ticks.
77+
static double ticksPerSecond() {
78+
return 1.0/secondsPerTick();
79+
}
80+
81+
static const char* tickUnits() {
82+
#if defined(__APPLE__) && !defined(__x86_64__)
83+
return "ns";
84+
#elif defined(__WIN32__) || defined(__x86_64__)
85+
return "cycles";
86+
#else
87+
return "ns"; // clock_gettime
88+
#endif
89+
}
90+
91+
//////////
92+
// Return the conversion from ticks to seconds.
93+
static double secondsPerTick() {
94+
static bool initialized = false;
95+
static double secondsPerTick_val;
96+
if (initialized) return secondsPerTick_val;
97+
#if defined(__APPLE__)
98+
#ifdef __x86_64__
99+
int args[] = {CTL_HW, HW_CPU_FREQ};
100+
unsigned int Hz;
101+
size_t len = sizeof(Hz);
102+
if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103+
fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104+
exit(-1);
105+
}
106+
secondsPerTick_val = 1.0 / (double) Hz;
107+
#else
108+
mach_timebase_info_data_t time_info;
109+
mach_timebase_info(&time_info);
110+
111+
// Scales to nanoseconds without 1e-9f
112+
secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113+
static_cast<double>(time_info.denom);
114+
#endif // x86_64 or not
115+
#elif defined(_WIN32)
116+
LARGE_INTEGER qwTicksPerSec;
117+
QueryPerformanceFrequency(&qwTicksPerSec);
118+
secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119+
#else
120+
FILE *fp = fopen("/proc/cpuinfo","r");
121+
char input[1024];
122+
if (!fp) {
123+
fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124+
exit(-1);
125+
}
126+
// In case we don't find it, e.g. on the N900
127+
secondsPerTick_val = 1e-9;
128+
while (!feof(fp) && fgets(input, 1024, fp)) {
129+
// NOTE(boulos): Because reading cpuinfo depends on dynamic
130+
// frequency scaling it's better to read the @ sign first
131+
float GHz, MHz;
132+
if (strstr(input, "model name")) {
133+
char* at_sign = strstr(input, "@");
134+
if (at_sign) {
135+
char* after_at = at_sign + 1;
136+
char* GHz_str = strstr(after_at, "GHz");
137+
char* MHz_str = strstr(after_at, "MHz");
138+
if (GHz_str) {
139+
*GHz_str = '\0';
140+
if (1 == sscanf(after_at, "%f", &GHz)) {
141+
//printf("GHz = %f\n", GHz);
142+
secondsPerTick_val = 1e-9f / GHz;
143+
break;
144+
}
145+
} else if (MHz_str) {
146+
*MHz_str = '\0';
147+
if (1 == sscanf(after_at, "%f", &MHz)) {
148+
//printf("MHz = %f\n", MHz);
149+
secondsPerTick_val = 1e-6f / GHz;
150+
break;
151+
}
152+
}
153+
}
154+
} else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155+
//printf("MHz = %f\n", MHz);
156+
secondsPerTick_val = 1e-6f / MHz;
157+
break;
158+
}
159+
}
160+
fclose(fp);
161+
#endif
162+
163+
initialized = true;
164+
return secondsPerTick_val;
165+
}
166+
167+
//////////
168+
// Return the conversion from ticks to milliseconds.
169+
static double msPerTick() {
170+
return secondsPerTick() * 1000.0;
171+
}
172+
173+
private:
174+
CycleTimer();
175+
};
176+
177+
#endif // #ifndef _SYRAH_CYCLE_TIMER_H_

Diff for: nfa.c

+36-8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
*/
1515

1616
#include "pnfa.h"
17+
#include "cycleTimer.h"
1718

1819
#define DEBUG
1920
#ifdef DEBUG
@@ -345,7 +346,7 @@ main(int argc, char **argv)
345346
char *post;
346347
SimpleReBuilder builder;
347348
State *start;
348-
double starttime, endtime;
349+
double startTime, endTime, endReadFile, endCopyNFAToDevice, endCopyStringsToDevice, endCudaMallocs, endCudaMallocs2, endCudaMemcpys, endPMatch;
349350
char **lines;
350351
int lineIndex;
351352

@@ -408,23 +409,23 @@ main(int argc, char **argv)
408409

409410
// if no file is specified
410411
if (fileName == NULL) {
411-
starttime = gettime();
412+
startTime = CycleTimer::currentSeconds();
412413
for(i=optIndex+1; i<argc; i++) {
413414
if(anyMatch(start, argv[i]))
414415
printf("%d: %s\n", i-(optIndex), argv[i]);
415416
}
416-
endtime = gettime();
417+
endTime = CycleTimer::currentSeconds();
417418
}
418419
else {
419420
readFile(fileName, &lines, &lineIndex);
420421

421-
starttime = gettime();
422+
startTime = CycleTimer::currentSeconds();
422423
for (i = 0; i < lineIndex; i++) {
423424
if (anyMatch(start, lines[i]))
424425
//TODO need to put this statement out side loop body
425426
printf("%s", lines[i]);
426427
}
427-
endtime = gettime();
428+
endTime = CycleTimer::currentSeconds();
428429

429430
for (i = 0; i <= lineIndex; i++)
430431
free(lines[i]);
@@ -440,20 +441,31 @@ main(int argc, char **argv)
440441
exit(EXIT_SUCCESS);
441442
}
442443

444+
startTime = CycleTimer::currentSeconds();
445+
443446
readFile(fileName, &lines, &lineIndex);
447+
448+
endReadFile = CycleTimer::currentSeconds();
444449

445450
State *device_start;
446451
char **device_lines;
452+
447453

448454
copyNFAToDevice(&device_start, start);
455+
456+
endCopyNFAToDevice = CycleTimer::currentSeconds();
449457

450458
copyStringsToDevice(lines, lineIndex, &device_lines);
459+
460+
endCopyStringsToDevice = CycleTimer::currentSeconds();
451461

452462
List *dl1;
453463
List *dl2;
454464

455465
cudaMalloc((void **) &dl1, sizeof (List));
456466
cudaMalloc((void **) &dl2, sizeof (List));
467+
468+
endCudaMallocs = CycleTimer::currentSeconds();
457469

458470
fflush(stdout);
459471

@@ -462,20 +474,36 @@ main(int argc, char **argv)
462474
cudaMalloc((void **) &(s1), nstate * sizeof (State *));
463475
cudaMalloc((void **) &(s2), nstate * sizeof (State *));
464476

477+
endCudaMallocs2 = CycleTimer::currentSeconds();
478+
465479
cudaMemcpy(&(dl1->s), &s1, sizeof (State **), cudaMemcpyHostToDevice);
466480
cudaMemcpy(&(dl2->s), &s2, sizeof (State **), cudaMemcpyHostToDevice);
481+
482+
endCudaMemcpys = CycleTimer::currentSeconds();
467483

468-
pMatch(device_start, device_lines, lineIndex, dl1, dl2);
484+
pMatch(device_start, device_lines, lineIndex, dl1, dl2, time);
485+
486+
endPMatch = CycleTimer::currentSeconds();
469487

470488
for (i = 0; i <= lineIndex; i++)
471489
free(lines[i]);
472490
free(lines);
473491

474492
}
475493

476-
if (time) {
477-
printf("\nTime taken %f \n\n", (endtime - starttime));
494+
if (time && !parallel) {
495+
printf("\nSequential Time taken %.4f \n\n", (endTime - startTime));
478496
}
497+
else if (time && parallel) {
498+
printf("\nParallel ReadFile Time taken %.4f \n", (endReadFile - startTime));
499+
printf("\nParallel CopyNFAToDevice Time taken %.4f \n", (endCopyNFAToDevice - endReadFile));
500+
printf("\nParallel CopyStringsToDevice Time taken %.4f \n", (endCopyStringsToDevice - endCopyNFAToDevice));
501+
printf("\nParallel CudaMallocs Time taken %.4f \n\n", (endCudaMallocs - endCopyStringsToDevice));
502+
printf("\nParallel CudaMallocs2 Time taken %.4f \n\n", (endCudaMallocs2 - endCudaMallocs));
503+
printf("\nParallel CudaMemcpys Time taken %.4f \n\n", (endCudaMemcpys - endCudaMallocs2));
504+
printf("\nParallel pMatch Time taken %.4f \n\n", (endPMatch - endCudaMemcpys));
505+
printf("\nParallel Total Time taken %.4f \n\n", (endPMatch - startTime));
506+
}
479507
// free up memory
480508
freeNFAStates(start);
481509

Diff for: pnfa.cu

+6-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
#include "pnfa.h"
99

10+
#define PRINT(time,...) if(!time) printf(__VA_ARGS__)
11+
1012
__device__ List *dl1, *dl2;
1113
__device__ static int dlistid;
1214
__device__ State pmatchstate = { Match }; /* matching state */
@@ -121,14 +123,14 @@ __device__ inline int panypmatch(State *start, char *s) {
121123
}
122124

123125

124-
__global__ void parallelMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2) {
126+
__global__ void parallelMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2, int time) {
125127
dl1 = ddl1;
126128
dl2 = ddl2;
127129

128130
int i;
129131
for (i = 0; i < lineIndex; i++) {
130132
if (panypmatch(start, lines[i]))
131-
printf("%s", lines[i]);
133+
PRINT(time, "%s", lines[i]);
132134
}
133135

134136
/*
@@ -140,9 +142,9 @@ __global__ void parallelMatch(State *start, char **lines, int lineIndex, List* d
140142

141143
}
142144

143-
void pMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2) {
145+
void pMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2, int time) {
144146
//printCudaInfo();
145-
parallelMatch<<<1,1>>>(start,lines,lineIndex,ddl1,ddl2);
147+
parallelMatch<<<1,1>>>(start,lines,lineIndex,ddl1,ddl2,time);
146148

147149

148150
//TODO free states

Diff for: pnfa.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
#include "nfautil.h"
33
#include "regex.h"
44

5-
void pMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2);
5+
void pMatch(State *start, char **lines, int lineIndex, List* ddl1, List *ddl2, int time);
66
void printCudaInfo();
77

0 commit comments

Comments
 (0)