-
Notifications
You must be signed in to change notification settings - Fork 20
/
faster_bytecode.cc
343 lines (282 loc) · 11.6 KB
/
faster_bytecode.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/*
This file is part of the FAST-ER machine learning system.
Copyright (C) 2008 Edward Rosten and Los Alamos National Laboratory
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "faster_bytecode.h"
#include <cvd/image.h>
#include <cerrno>
///\cond never
using namespace CVD;
using namespace std;
///\endcond
#ifdef JIT
#include <sys/mman.h>
///This struct contains a x86 machine-code compiled version of the detector. The detector
///operates on a single row and inserts offset from the beginning of the image in to a
///std::vector.
///@ingroup gFastTree
class jit_detector
{
public:
///Run the compiled detector on a row of an image.
///@param im The image.
///@param row The row to detect corners in.
///@param xmin The starting position.
///@param xmax The ending position.
///@param corners The detected corners as offsets from image.data().
///@param threshold The corner detector threshold.
void detect_in_row(const Image<byte>& im, int row, int xmin, int xmax, vector<int>& corners, int threshold)
{
const byte* p = im[row] + xmin;
const int n = xmax - xmin;
void* cs = &corners;
const void* im_data = im.data();
/* r/m usage, at entry to machine code
In use:
%ecx Num remaining
%edi threshold
%ebp Detect in row machine code procedure address
%ebx cb
%edx c_b
%esi data
%eax Scratch
4%esp %esi: produced automatically by call
8%esp image.data()
12%esp &vector<int>
16%esp vector_inserter: simple function for calling member of std::vector
Input:
0 num remaining
1 data pointer
2 threshold
3 proc
4 push_back_proc
5 vector<int>
6 image.data()
*/
__asm__ __volatile__(
//Save all registers
" pusha \n"
//Load operands in to correct places
" pushl %4 \n"
" pushl %5 \n"
" pushl %6 \n"
" movl %0, %%ecx \n"
" movl %1, %%esi \n"
" movl %2, %%edi \n"
" movl %3, %%ebp \n" //%? uses ebp, so trash ebp last
//Start the loop
" cmp $0, %%ecx \n"
" je 1 \n"
" call *%%ebp \n"
"1: \n"
//Unload operands
" popl %%eax \n"
" popl %%eax \n"
" popl %%eax \n"
//Restore all registers
" popa \n"
:
: "m"(n), "m"(p), "m"(threshold), "m"(proc), "i"(&vector_inserter), "m"(cs), "m"(im_data)
);
}
///Create a compiled detector from the bytecode.
///@param v Bytecode.
jit_detector(const vector<block_bytecode::fast_detector_bit>& v)
{
//blocksize
const int bs=28;
length = bs * (v.size() + 2); //Add head and tail block
/* The original assembler code looked like this
This is now done in machine code, with the whole tree in
place of line 0x804e0c1.
804e0b3: 83 f9 00 cmp $0x0,%ecx
804e0b6: 74 1b je 804e0d3 <finished>
0804e0b8 <loop>:
804e0b8: 0f b6 16 movzbl (%esi),%edx
804e0bb: 89 d3 mov %edx,%ebx
804e0bd: 29 fa sub %edi,%edx
804e0bf: 01 fb add %edi,%ebx
804e0c1: ff d5 call *%ebp
804e0c3: a8 ff test $0xff,%al
804e0c5: 74 08 je 804e0cf <nocorner>
804e0c7: 56 push %esi
804e0c8: 51 push %ecx
804e0c9: ff 54 24 10 call *0x10(%esp)
804e0cd: 59 pop %ecx
804e0ce: 58 pop %eax
0804e0cf <nocorner>:
804e0cf: 46 inc %esi
804e0d0: 49 dec %ecx
804e0d1: 75 e5 jne 804e0b8 <loop> //jne == jnz
Unused spaces are filled in with int $3, (instruction 0xcc), which
causes a debug trap. Makes catching errors easier.
The consists of fixed sized blocks pasted together. The size is determined by the
largest block, which is a tree node. This makes jump computation trivial, but
it also means that short jumps are never used, and the code is therefore larger
than necessary.
The rest have 0xcc filled in in the spare places.
The blocks are templates and have the relevant parts filled in prior to
copying.
Each tree node (including leaves are represented by an entire block)
Detectod corners are inserted in to a vector<int> as the integer offset of the corner
pixel from the beginning of the image
*/
const unsigned char loop_head[bs] =
{
0xEB, 0x11, //jmp + 17
0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, //dead space
0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
0xcc,0xcc,0xcc,0xcc,0xcc,
0x0f, 0xb6, 0x16, //movzbl (%esi),%edx Load data
0x89, 0xd3, //mov %edx,%ebx
0x29, 0xfa, //sub %edi,%edx Compute c_b
0x01, 0xfb, //add %edi,%ebx Compute cb
};
const int loop_head_start=19; //Jump to here to continue loop
unsigned char loop_tail[bs] =
{
0x56, //push %esi Functions seem to trash this otherwise
0x51, //push %ecx Functions seem to trash this otherwise
0xFF, 0x54, 0x24, 0x14, //call *16(%esp) Other arguments on the stack already
0x59, //pop %ecx Clean stack
0x58, //pop %eax ...
0x46, //inc %esi
0x49, //dec %ecx
0x0F, 0x85, 0xcc, 0xcc, 0xcc, 0xcc, //jnz <back to first block>
0xc3, //ret
0xcc,0xcc,0xcc,0xcc, //dead space
0xcc,0xcc,0xcc,0xcc,
0xcc,0xcc,0xcc,
};
const int loop_tail_address_offset = 12; //fill in the jump <back to first block> address here
const int loop_tail_jump_delta = 16; //Jump block_size*depth + this, to loop.
const int loop_tail_entry = 8; //jump to here to avoid inserting current point as corner
unsigned char cont_or_goto[bs] =
{
0xE9,0xcc, 0xcc, 0xcc, 0xcc, //Jump to end of loop
0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, //dead space
0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
0xcc,0xcc,0xcc,0xcc,0xcc
};
const int cont_jmp_addr = 1; //Jump address filled in here
const int cont_delta = 5; //This much in addition to block delta
unsigned char branch[bs] =
{
0x0f, 0xB6, 0x86, 0xcc, 0xcc, 0xcc, 0xcc, //movzbl OOOO(%esi),%eax
0x39, 0xd8, //cmp %ebx, %eax (eax - ebx) = (data[##]-cb
0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc, //jg XXXX jmp by XXXX if eax > ebx
0x39, 0xC2, //cmp %eax, %edx (edx - eax) = c_b - data[##]
0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc, //jg YYYY jmp by YYYY if ecx > ebx
0xE9, 0xcc, 0xcc, 0xcc, 0xcc, //jmp ZZZZ Unconditional jump to ZZZZ
};
const int block_off_off = 3;
const int block_gt_off = 11;
const int block_lt_off = 19;
const int block_eq_off = 24;
//mmap a writable, executable block of memory for JITted code
proc = (unsigned char*) mmap(0, length, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if(proc == MAP_FAILED)
{
cerr << "mmap failed with error: " << strerror(errno) << endl;
exit(1);
}
//Copy in the loop head: no parts to be filled in.
memcpy(proc, loop_head, bs);
for(int i=0; i < (int)v.size(); i++)
{
if(v[i].lt == 0) // leaf
{
if(v[i].gt == 0) //Fill in jump to continue part
{
*(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size()- i) - cont_delta + loop_tail_entry;
}
else //fill in jump to insert part
{
*(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size() - i) - cont_delta;
}
memcpy(proc + (i+1)*bs, cont_or_goto, bs);
}
else
{
*(int*)(branch+block_off_off) = v[i].offset;
//Optimization: leaf nodes have a non-conditional goto in them
//so goto the right place directly, rather than the leaf node.
//This has a 5% effect or so, so bigger gains elsewhere.
//Removed for simplicity.
*(int*)(branch+block_gt_off) = (v[i].gt -i) * bs - (block_gt_off + 4);
*(int*)(branch+block_lt_off) = (v[i].lt -i) * bs - (block_lt_off + 4);
*(int*)(branch+block_eq_off) = (v[i].eq -i) * bs - (block_eq_off + 4);
memcpy(proc + (i+1)*bs, branch, bs);
}
}
//Insert the correct backwards jump for looping
*(int*)(loop_tail+loop_tail_address_offset) = -bs * (1+v.size()) - loop_tail_jump_delta + loop_head_start;
memcpy(proc + bs * (v.size() + 1), loop_tail, bs);
}
///Destroy object, unmapping executable memory.
~jit_detector()
{
munmap(proc, length);
}
private:
///Prevent copying
void operator=(const jit_detector&);
///Prevent copying
jit_detector(const jit_detector&);
unsigned char* proc; ///< The machine code is stored in this mmap() allocated data which allows code execution.
int length; ///< Number of mmap() allocated bytes.
///Callback function to allow insertion in to std::vector. The execution of this function
///relies on the stack having the following layout (stack head on the left):
///@code
///return_address first_arg second_arg etc...
///@endcode
///so that the arguemnts directly reflect the stack layout.
///For speed, and in order to minimize stack handling, the argument list spans two call instructions worth of stack.
///
///@param ecx_dummy Pushed by the machine code, since the ABI allows ecx to be trashed
///@param p The pointer to the current pixel. Pushed by the machine code.
///@param esp_return_dummy Location to return to on a return from the machine code. Generated by the assembler call in to the machine code.
///@param im_data Pointer to the first image pixel. Pushed by the assembler caller.
///@param i Pointer to the std::vector<int> which stores the data. Pushed by the assembler caller.
static void vector_inserter(int ecx_dummy, const byte* p, const void* esp_return_dummy, const byte* im_data, vector<int>* i)
{
i->push_back(p-im_data);
}
};
#endif
///Detect corners in an image. The width of the image must match the width the
///detector was compiled to (using tree_elemeent::make_fast_detector for the
///results to make sense. The bytecode is JIT coimpiled if possible.
///@param im The image in which to detect corners
///@param corners Detected corners are inserted in to this container.
///@param threshold Corner detector threshold to use
///@param xmin x coordinate to start at.
///@param ymin y coordinate to start at.
///@param xmax x coordinate to go up to.
///@param ymax y coordinate to go up to.
void block_bytecode::detect(const CVD::Image<CVD::byte>& im, std::vector<int>& corners, int threshold, int xmin, int xmax, int ymin, int ymax)
{
#ifdef JIT
jit_detector jit(d);
for(int y = ymin; y < ymax; y++)
jit.detect_in_row(im, y, xmin, xmax, corners, threshold);
#else
cerr << "Hello!\n";
for(int y = ymin; y < ymax; y++)
for(int x=xmin; x < xmax; x++)
if(detect_no_score(&im[y][x], threshold))
corners.push_back(&im[y][x] - im.data());
#endif
}