forked from verilator/verilator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathverilated_profiler.cpp
230 lines (200 loc) · 8.55 KB
/
verilated_profiler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2024 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated run-time profiling implementation code
///
//=============================================================================
#include "verilatedos.h"
#include "verilated_profiler.h"
#include "verilated_threads.h"
#include <fstream>
#include <string>
//=============================================================================
// Globals
// Internal note: Globals may multi-construct, see verilated.cpp top.
// Per-thread buffer of execution records. Each thread registers the address of its own
// instance with the profiler in VlExecutionProfiler::setupThread().
thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
// Out-of-class definition of the table of record type names; VlExecutionProfiler::dump()
// indexes it by record type when writing the profile.
constexpr const char* const VlExecutionRecord::s_ascii[];
//=============================================================================
// VlExecutionRecord implementation
/// Return an identifier for the CPU the calling thread is currently running on.
/// The mechanism is platform specific; platforms without a supported query report 0.
uint16_t VlExecutionRecord::getcpu() {
#if defined(__linux)
    // TODO: this is a system call. Not exactly cheap.
    return sched_getcpu();
#elif defined(__APPLE__) && !defined(__arm64__)
    uint32_t regs[4];
    __cpuid_count(1, 0, regs[0], regs[1], regs[2], regs[3]);
    // Bit 9 of regs[3] clear means there is no APIC on chip; report that as (uint16_t)-1
    if ((regs[3] & (1 << 9)) == 0) return static_cast<uint16_t>(-1);
    // regs[1] is EBX, bits 24-31 are APIC ID
    return static_cast<unsigned>(regs[1]) >> 24;
#elif defined(_WIN32)
    return GetCurrentProcessorNumber();
#else
    return 0;
#endif
}
//=============================================================================
// VlExecutionProfiler implementation
/// Round 'value' up to the nearest multiple of N.
///
/// N must be a non-zero power of two; this is enforced at compile time. A value that is
/// already a multiple of N is returned unchanged. The arithmetic is unsigned, so a 'value'
/// within N - 1 of the maximum size_t wraps around rather than saturating.
template <size_t N>
static size_t roundUptoMultipleOf(size_t value) {
    // N == 0 would satisfy the bit trick below (0 & (0 - 1) == 0), so exclude it explicitly
    static_assert(N != 0 && (N & (N - 1)) == 0, "'N' must be a power of 2");
    // For a power of two, N - 1 has exactly the low bits set that must be cleared
    constexpr size_t mask = N - 1;
    return (value + mask) & ~mask;
}
/// Create a profiler bound to 'context' and register the constructing thread's trace
/// buffer under thread id 0.
VlExecutionProfiler::VlExecutionProfiler(VerilatedContext& context)
    : m_context{context} {
    // The thread running the constructor is treated as the main thread (id 0)
    this->setupThread(0);
}
/// Advance the profiling state by one step.
///
/// While disabled, checks whether a new start request is due and, if so, enables profiling
/// with a countdown of twice the configured window. While enabled, each call decrements the
/// countdown: when it reaches the window size the buffers are cleared and the start tick is
/// taken (the first half served as warm-up); when it reaches zero the profile is dumped and
/// profiling is disabled again.
void VlExecutionProfiler::configure() {
    if (VL_UNLIKELY(m_enabled)) {
        // Profiling is active: consume one step of the countdown
        --m_windowCount;
        if (VL_UNLIKELY(m_windowCount == m_context.profExecWindow())) {
            // Warm-up half is over, real collection starts now
            VL_DEBUG_IF(VL_DBG_MSGF("+ profile start collection\n"););
            clear();  // Clear the profile after the cache warm-up cycles.
            m_tickBegin = VL_CPU_TICK();
        } else if (VL_UNLIKELY(m_windowCount == 0)) {
            // Collection window exhausted: take the end tick first, then write the file
            const uint64_t endTick = VL_CPU_TICK();
            VL_DEBUG_IF(VL_DBG_MSGF("+ profile end\n"););
            dump(m_context.profExecFilename().c_str(), endTick);
            m_enabled = false;
        }
    } else {
        // Not profiling: see whether a start request that was not yet served is now due.
        // + 1, so we can start at time 0
        const uint64_t requestedStart = m_context.profExecStart() + 1;
        const bool unservedRequest = m_lastStartReq < requestedStart;
        if (VL_UNLIKELY(unservedRequest && VL_TIME_Q() >= m_context.profExecStart())) {
            VL_DEBUG_IF(VL_DBG_MSGF("+ profile start warmup\n"););
            VL_DEBUG_IF(assert(m_windowCount == 0););
            m_enabled = true;
            // First half of the countdown is warm-up, second half is collection
            m_windowCount = m_context.profExecWindow() * 2;
            m_lastStartReq = requestedStart;
        }
    }
}
/// Allocate a profiler for 'context' and register every worker thread of the context's
/// thread pool (if it has one) with it.
///
/// Worker i of the pool is registered under thread id i + 1; id 0 is taken by the thread
/// running the constructor. The object is allocated with 'new' and returned as a raw
/// pointer; the caller is presumably responsible for deleting it - confirm against the
/// call site.
VerilatedVirtualBase* VlExecutionProfiler::construct(VerilatedContext& context) {
    VlExecutionProfiler* const selfp = new VlExecutionProfiler{context};

    VlThreadPool* const poolp = static_cast<VlThreadPool*>(context.threadPoolp());
    if (!poolp) return selfp;  // No thread pool, nothing more to set up

    // Arguments handed to the worker thread initialization task
    struct InitArgs final {
        VlExecutionProfiler* const selfp;
        const uint32_t threadId;
    };
    // Task body: runs on the worker thread and registers that thread's trace buffer
    const auto initWorker = [](void* userp, bool) {
        InitArgs* const argsp = static_cast<InitArgs*>(userp);
        argsp->selfp->setupThread(argsp->threadId);
    };

    for (int idx = 0; idx < poolp->numThreads(); ++idx) {
        InitArgs args{selfp, static_cast<uint32_t>(idx + 1)};
        poolp->workerp(idx)->addTask(initWorker, &args);
        // 'args' lives on this stack frame, so block until the worker is done with it
        poolp->workerp(idx)->wait();
    }
    return selfp;
}
/// Prepare the calling thread for profiling and register its trace buffer under 'threadId'.
/// Registering the same 'threadId' twice is reported as a fatal error.
void VlExecutionProfiler::setupThread(uint32_t threadId) {
    // Pre-allocate room in this thread's buffer, to try to avoid malloc while profiling
    t_trace.reserve(RESERVED_TRACE_CAPACITY);

    // Add this thread's buffer to the list of all buffers; hold the lock only for the insert
    bool inserted = false;
    {
        const VerilatedLockGuard lock{m_mutex};
        inserted = m_traceps.emplace(threadId, &t_trace).second;
    }
    if (VL_UNLIKELY(!inserted)) {
        VL_FATAL_MT(__FILE__, __LINE__, "", "multiple initialization of profiler on some thread");
    }
}
/// Discard all records collected so far in every registered trace buffer.
/// Each buffer is afterwards asked to reserve its previous size rounded up to a multiple of
/// RESERVED_TRACE_CAPACITY. Takes m_mutex for the duration of the call.
void VlExecutionProfiler::clear() VL_MT_SAFE_EXCLUDES(m_mutex) {
    const VerilatedLockGuard lock{m_mutex};
    for (const auto& entry : m_traceps) {
        ExecutionTrace& trace = *entry.second;
        // Size must be sampled before clearing
        const size_t wantedCapacity = roundUptoMultipleOf<RESERVED_TRACE_CAPACITY>(trace.size());
        trace.clear();
        trace.reserve(wantedCapacity);
    }
}
/// Write the collected execution profile to the file 'filenamep'.
///
/// The output consists of a version header, the profiling arguments of this profiler's
/// context, summary statistics, a copy of /proc/cpuinfo (when readable), and then for every
/// thread with a non-empty trace one VLPROFEXEC line per record. Record times are written
/// relative to m_tickBegin; 'tickEnd' is used for the final "ticks" statistic.
/// Takes m_mutex for the duration of the call. Failure to open the file is reported through
/// VL_FATAL_MT and nothing is written.
void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
    VL_MT_SAFE_EXCLUDES(m_mutex) {
    const VerilatedLockGuard lock{m_mutex};
    VL_DEBUG_IF(VL_DBG_MSGF("+prof+exec writing to '%s'\n", filenamep););

    FILE* const fp = std::fopen(filenamep, "w");
    if (VL_UNLIKELY(!fp)) {
        VL_FATAL_MT(filenamep, 0, "", "+prof+exec+file file not writable");
        return;  // Never write through a null FILE*, even if the fatal handler returns
    }

    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
    fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
    // Report the settings of the context this profiler was configured from, the same one
    // configure() consults, rather than whatever context the calling thread has current.
    fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
            m_context.profExecStart());
    fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n", m_context.profExecWindow());

    // Note that VerilatedContext will by default create as many threads as there are hardware
    // processors, but not all of them might be utilized. Report the actual number that has trace
    // entries to avoid over-counting.
    unsigned threads = 0;
    for (const auto& pair : m_traceps) {
        if (!pair.second->empty()) ++threads;
    }
    fprintf(fp, "VLPROF stat threads %u\n", threads);
    fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());

    // Copy /proc/cpuinfo into this output so verilator_gantt can be run on
    // a different machine
    {
        std::ifstream cpuinfo{"/proc/cpuinfo"};
        if (!cpuinfo.fail()) {
            std::string line;
            while (std::getline(cpuinfo, line)) { fprintf(fp, "VLPROFPROC %s\n", line.c_str()); }
        }
    }

    for (const auto& pair : m_traceps) {
        const uint32_t threadId = pair.first;
        ExecutionTrace* const tracep = pair.second;
        if (tracep->empty()) continue;  // Threads that recorded nothing are omitted
        fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);

        for (const VlExecutionRecord& er : *tracep) {
            // Common prefix: record type name and time since the start of collection
            const char* const name = VlExecutionRecord::s_ascii[static_cast<uint8_t>(er.m_type)];
            const uint64_t time = er.m_tick - m_tickBegin;
            fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);

            // Type specific payload, always terminated by a newline
            switch (er.m_type) {
            case VlExecutionRecord::Type::SECTION_POP:
            case VlExecutionRecord::Type::EXEC_GRAPH_BEGIN:
            case VlExecutionRecord::Type::EXEC_GRAPH_END:
                // No payload
                fprintf(fp, "\n");
                break;
            case VlExecutionRecord::Type::MTASK_BEGIN: {
                const auto& payload = er.m_payload.mtaskBegin;
                fprintf(fp, " id %u predictStart %u cpu %u\n", payload.m_id,
                        payload.m_predictStart, payload.m_cpu);
                break;
            }
            case VlExecutionRecord::Type::MTASK_END: {
                const auto& payload = er.m_payload.mtaskEnd;
                fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
                break;
            }
            case VlExecutionRecord::Type::SECTION_PUSH: {
                const auto& payload = er.m_payload.sectionPush;
                fprintf(fp, " %s\n", payload.m_name);
                break;
            }
            default: abort();  // LCOV_EXCL_LINE
            }
        }
    }

    fprintf(fp, "VLPROF stat ticks %" PRIu64 "\n", tickEnd - m_tickBegin);
    std::fclose(fp);
}