faster_bytecode.cc

00001 /*
00002 
00003     This file is part of the FAST-ER machine learning system.
00004     Copyright (C) 2008  Edward Rosten and Los Alamos National Laboratory
00005 
00006     This program is free software; you can redistribute it and/or modify
00007     it under the terms of the GNU General Public License as published by
00008     the Free Software Foundation; either version 2 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014     GNU General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License along
00017     with this program; if not, write to the Free Software Foundation, Inc.,
00018     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00019 */
00020 #include "faster_bytecode.h"
00021 #include <cvd/image.h>
00022 #include <cerrno>
00023 
00024 ///\cond never
00025 using namespace CVD;
00026 using namespace std;
00027 ///\endcond
00028 
00029 #ifdef JIT
00030 #include <sys/mman.h>
00031 ///This struct contains a x86 machine-code compiled version of the detector. The detector
00032 ///operates on a single row and inserts offset from the beginning of the image in to a 
00033 ///std::vector. 
00034 ///@ingroup gFastTree
00035 class jit_detector
00036 {
00037     public:
00038         
00039 
00040         ///Run the compiled detector on a row of an image.
00041         ///@param im The image.
00042         ///@param row The row to detect corners in.
00043         ///@param xmin The starting position.
00044         ///@param  xmax The ending position.
00045         ///@param corners The detected corners as offsets from image.data().
00046         ///@param threshold The corner detector threshold.
00047         void detect_in_row(const Image<byte>& im, int row, int xmin, int xmax, vector<int>& corners, int threshold)
00048         {
00049 
00050             const byte* p = im[row] + xmin;
00051             const int n = xmax - xmin;
00052             void* cs = &corners;
00053             const void* im_data = im.data();
00054             /* r/m usage, at entry to machine code
00055                 In use:
00056                     %ecx                Num remaining
00057                     %edi                threshold
00058                     %ebp                Detect in row machine code procedure address
00059                     %ebx                cb
00060                     %edx                c_b
00061                     %esi                data
00062                     %eax                Scratch
00063 
00064                     4%esp               %esi: produced automatically by call
00065                     8%esp               image.data()
00066                     12%esp              &vector<int>
00067                     16%esp              vector_inserter: simple function for calling member of std::vector
00068 
00069 
00070                 Input:  
00071                     0 num remaining
00072                     1 data pointer
00073                     2 threshold
00074                     3 proc 
00075                     4 push_back_proc
00076                     5 vector<int>
00077                     6 image.data()
00078             */
00079 
00080             __asm__ __volatile__(
00081                 //Save all registers
00082                 "   pusha                               \n"
00083                 
00084                 //Load operands in to correct places
00085                 "   pushl           %4                  \n"
00086                 "   pushl           %5                  \n"
00087                 "   pushl           %6                  \n"
00088                 "   movl            %0, %%ecx           \n" 
00089                 "   movl            %1, %%esi           \n"
00090                 "   movl            %2, %%edi           \n"
00091                 "   movl            %3, %%ebp           \n"   //%? uses ebp, so trash ebp last
00092 
00093                 
00094                 //Start the loop
00095                 "   cmp             $0, %%ecx           \n"
00096                 "   je              1                   \n"
00097                 "   call            *%%ebp              \n"
00098                 "1:                                     \n"
00099 
00100 
00101                 //Unload operands
00102                 "   popl            %%eax               \n"
00103                 "   popl            %%eax               \n"
00104                 "   popl            %%eax               \n"
00105 
00106                 //Restore all registers
00107                 "   popa                                \n"
00108                 :
00109                 : "m"(n), "m"(p), "m"(threshold), "m"(proc), "i"(&vector_inserter), "m"(cs), "m"(im_data)
00110             );
00111 
00112 
00113         }
00114 
00115 
00116         ///Create a compiled detector from the bytecode.
00117         ///@param v Bytecode.
00118         jit_detector(const vector<block_bytecode::fast_detector_bit>& v)
00119         {
00120             //blocksize
00121             const int bs=28;
00122 
00123             length = bs * (v.size() + 2); //Add head and tail block
00124 
00125             /* The original assembler code looked like this
00126                This is now done in machine code, with the whole tree in
00127                place of  line 0x804e0c1.
00128 
00129              804e0b3:   83 f9 00                cmp    $0x0,%ecx
00130              804e0b6:   74 1b                   je     804e0d3 <finished>
00131 
00132             0804e0b8 <loop>:
00133              804e0b8:   0f b6 16                movzbl (%esi),%edx
00134              804e0bb:   89 d3                   mov    %edx,%ebx
00135              804e0bd:   29 fa                   sub    %edi,%edx
00136              804e0bf:   01 fb                   add    %edi,%ebx
00137              804e0c1:   ff d5                   call   *%ebp
00138              804e0c3:   a8 ff                   test   $0xff,%al
00139              804e0c5:   74 08                   je     804e0cf <nocorner>
00140              804e0c7:   56                      push   %esi
00141              804e0c8:   51                      push   %ecx
00142              804e0c9:   ff 54 24 10             call   *0x10(%esp)
00143              804e0cd:   59                      pop    %ecx
00144              804e0ce:   58                      pop    %eax
00145 
00146             0804e0cf <nocorner>:
00147              804e0cf:   46                      inc    %esi
00148              804e0d0:   49                      dec    %ecx
00149              804e0d1:   75 e5                   jne    804e0b8 <loop>           //jne == jnz
00150 
00151             Unused spaces are filled in with int $3, (instruction 0xcc), which
00152             causes a debug trap. Makes catching errors easier.
00153             
00154             The consists of fixed sized blocks pasted together. The size is determined by the
00155             largest block, which is a tree node. This makes jump computation trivial, but 
00156             it also means that short jumps are never used, and the code is therefore larger
00157             than necessary.
00158 
00159             The rest have 0xcc filled in in the spare places. 
00160 
00161             The blocks are templates and have the relevant parts filled in prior to 
00162             copying.
00163 
00164             Each tree node (including leaves are represented by an entire block)
00165 
00166             Detectod corners are inserted in to a vector<int> as the integer offset of the corner
00167             pixel from the beginning of the image
00168             */
00169 
00170             const unsigned char loop_head[bs] = 
00171             {
00172                 0xEB, 0x11,                         //jmp + 17
00173 
00174                 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,      //dead space
00175                 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
00176                 0xcc,0xcc,0xcc,0xcc,0xcc,
00177 
00178 
00179                 0x0f, 0xb6, 0x16,                   //movzbl (%esi),%edx            Load data
00180                 0x89, 0xd3,                         //mov    %edx,%ebx
00181                 0x29, 0xfa,                         //sub    %edi,%edx              Compute c_b
00182                 0x01, 0xfb,                         //add    %edi,%ebx              Compute cb
00183             };
00184             const int loop_head_start=19;           //Jump to here to continue loop
00185 
00186 
00187             unsigned char loop_tail[bs] = 
00188             {
00189                 0x56,                               //push %esi             Functions seem to trash this otherwise
00190                 0x51,                               //push %ecx             Functions seem to trash this otherwise
00191                 0xFF, 0x54, 0x24, 0x14,             //call *16(%esp)        Other arguments on the stack already
00192                 0x59,                               //pop %ecx              Clean stack
00193                 0x58,                               //pop %eax              ...
00194                 
00195                 0x46,                               //inc %esi
00196                 0x49,                               //dec %ecx
00197                 0x0F, 0x85, 0xcc, 0xcc, 0xcc, 0xcc, //jnz <back to first block>
00198 
00199                 0xc3,                               //ret
00200                 0xcc,0xcc,0xcc,0xcc,                //dead space 
00201                 0xcc,0xcc,0xcc,0xcc,
00202                 0xcc,0xcc,0xcc,
00203             };
00204             const int loop_tail_address_offset = 12;   //fill in the jump <back to first block> address here
00205             const int loop_tail_jump_delta     = 16;   //Jump block_size*depth + this, to loop.
00206             const int loop_tail_entry          = 8;    //jump to here to avoid inserting current point as corner
00207 
00208             unsigned char cont_or_goto[bs] = 
00209             {   
00210                 0xE9,0xcc, 0xcc, 0xcc, 0xcc,        //Jump to end of loop
00211                 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,      //dead space
00212                 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
00213                 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc,
00214                 0xcc,0xcc,0xcc,0xcc,0xcc
00215             };
00216             const int cont_jmp_addr = 1;            //Jump address filled in here
00217             const int cont_delta = 5;               //This much in addition to block delta
00218             
00219             unsigned char branch[bs] = 
00220             {
00221                 0x0f, 0xB6, 0x86, 0xcc, 0xcc, 0xcc, 0xcc,   //movzbl   OOOO(%esi),%eax
00222                 0x39, 0xd8,                                 //cmp      %ebx, %eax   (eax - ebx) = (data[##]-cb
00223                 0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc,         //jg       XXXX         jmp by XXXX if eax > ebx
00224                 0x39, 0xC2,                                 //cmp      %eax, %edx   (edx - eax) = c_b - data[##]
00225                 0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc,         //jg       YYYY         jmp by YYYY if ecx > ebx
00226                 0xE9, 0xcc, 0xcc, 0xcc, 0xcc,               //jmp      ZZZZ         Unconditional jump to ZZZZ
00227             };
00228             const int block_off_off = 3;
00229             const int block_gt_off = 11;
00230             const int block_lt_off = 19;
00231             const int block_eq_off = 24;
00232 
00233 
00234             //mmap a writable, executable block of memory for JITted code
00235             proc = (unsigned char*) mmap(0, length, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
00236             if(proc == MAP_FAILED)
00237             {
00238                 cerr << "mmap failed with error: " << strerror(errno) << endl;
00239                 exit(1);
00240             }
00241             
00242             //Copy in the loop head: no parts to be filled in.
00243             memcpy(proc, loop_head, bs);
00244 
00245             for(int i=0; i < (int)v.size(); i++)
00246             {
00247                 if(v[i].lt == 0)            // leaf
00248                 {
00249                     if(v[i].gt == 0) //Fill in jump to continue part
00250                     {
00251                         *(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size()- i) - cont_delta + loop_tail_entry;
00252                     }
00253                     else //fill in jump to insert part
00254                     {
00255                         *(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size() - i) - cont_delta;
00256                     }
00257                         
00258 
00259                     memcpy(proc + (i+1)*bs, cont_or_goto, bs);
00260                 }
00261                 else
00262                 {
00263                     *(int*)(branch+block_off_off)  = v[i].offset;
00264 
00265                     //Optimization: leaf nodes have a non-conditional goto in them
00266                     //so goto the right place directly, rather than the leaf node.
00267                     //This has a 5% effect or so, so bigger gains elsewhere.
00268                     //Removed for simplicity.
00269                     
00270                     *(int*)(branch+block_gt_off) = (v[i].gt -i) * bs - (block_gt_off + 4);
00271                     *(int*)(branch+block_lt_off) = (v[i].lt -i) * bs - (block_lt_off + 4);
00272                     *(int*)(branch+block_eq_off) = (v[i].eq -i) * bs - (block_eq_off + 4);
00273 
00274                     memcpy(proc + (i+1)*bs, branch, bs);
00275                 }
00276             }
00277             
00278             //Insert the correct backwards jump for looping
00279             *(int*)(loop_tail+loop_tail_address_offset) = -bs * (1+v.size()) - loop_tail_jump_delta + loop_head_start;
00280             memcpy(proc + bs * (v.size() + 1), loop_tail, bs);
00281 
00282         }
00283 
00284         ///Destroy object, unmapping executable memory.
00285         ~jit_detector()
00286         {
00287             munmap(proc, length);
00288         }
00289 
00290     private:
00291         ///Prevent copying
00292         void operator=(const jit_detector&);
00293         ///Prevent copying
00294         jit_detector(const jit_detector&);
00295 
00296         unsigned char* proc;            ///< The machine code is stored in this mmap() allocated data which allows code execution.
00297         int            length;          ///< Number of mmap() allocated bytes.
00298 
00299         ///Callback function to allow insertion in to std::vector. The execution of this function
00300         ///relies on the stack having the following layout (stack head on the left):
00301         ///@code
00302         ///return_address first_arg second_arg etc...
00303         ///@endcode
00304         ///so that the arguemnts directly reflect the stack layout.
00305         ///For speed, and in order to minimize stack handling, the argument list spans two call instructions worth of stack.
00306         ///
00307         ///@param ecx_dummy Pushed by the machine code, since the ABI allows ecx to be trashed
00308         ///@param p The pointer to the current pixel. Pushed by the machine code.
00309         ///@param esp_return_dummy Location to return to on a return from the machine code. Generated by the assembler call in to the machine code.
00310         ///@param im_data Pointer to the first image pixel. Pushed by the assembler caller.
00311         ///@param i  Pointer to the std::vector<int> which stores the data. Pushed by the assembler caller.
00312         static void vector_inserter(int ecx_dummy, const byte* p, const void* esp_return_dummy, const byte* im_data, vector<int>* i)
00313         {
00314             i->push_back(p-im_data);
00315         }
00316 };
00317 #endif
00318 
00319 ///Detect corners in an image. The width of the image must match the width the
00320 ///detector was compiled to (using tree_elemeent::make_fast_detector for the
00321 ///results to make sense. The bytecode is JIT coimpiled if possible.
00322 ///@param im The image in which to detect corners
00323 ///@param corners Detected corners are inserted in to this container.
00324 ///@param threshold Corner detector threshold to use
00325 ///@param xmin x coordinate to start at.
00326 ///@param ymin y coordinate to start at.
00327 ///@param xmax x coordinate to go up to.
00328 ///@param ymax y coordinate to go up to.
00329 void block_bytecode::detect(const CVD::Image<CVD::byte>& im, std::vector<int>& corners, int threshold, int xmin, int xmax, int ymin, int ymax)
00330 {
00331     #ifdef JIT
00332         jit_detector jit(d);
00333         for(int y = ymin; y < ymax; y++)
00334             jit.detect_in_row(im, y, xmin, xmax, corners, threshold);
00335     #else
00336         cerr << "Hello!\n";
00337         for(int y = ymin; y < ymax; y++)
00338             for(int x=xmin; x < xmax; x++)
00339                 if(detect_no_score(&im[y][x], threshold))
00340                     corners.push_back(&im[y][x] - im.data());
00341     #endif
00342 }
00343 

Generated on Mon Mar 2 12:47:12 2009 for FAST-ER by  doxygen 1.5.3