【发布时间】:2014-02-28 02:37:17
【问题描述】:
在使用优化和内联汇编编译时,我的代码无法正常工作。
我认为实际发布程序集对我没有帮助,因为这是一个更普遍的问题。
我的内联汇编使用与 C 版本相同的变量(作为输入参数传递)
asm ( "" : : "r" (arg1), "r" (arg2) :);
我的问题是,我怎样才能让编译器不优化这些变量?只有在使用优化编译时,该功能才会因汇编而中断。我尝试了 volatile,但它仍然不正确。
谢谢。
更新:
我正在尝试为 OpenCV 实现 NEON 优化,特别是 lkpyramid.cpp 文件。问题是,在发布模式(设置了优化)它不能正常工作。但是,在调试模式下,它工作正常。我追踪了一个特定的变量(FLT_SCALE),该变量正在被优化并使其易变,之后该部分工作正常,但我仍然有另一个优化的不正确行为。
gcc 版本可能会有所不同,因为这是一个开源项目,但我目前使用的是 4.8.1。目标架构是带有 NEON 的 ARMv7。我正在测试的处理器是 ARM Cortex-A15(big.LITTLE)。
以下是我当前状态的代码。忽略所有的 cmets 和 volatiles(用于测试这个问题)。这是在制品。我删除了不相关的代码,所以我可以把它放在这里。我认为问题出在最底部的asm 块中,因为如果我使用if(false) 跳过它,我就不会遇到问题。谢谢。
#include "precomp.hpp"
#include <float.h>
#include <stdio.h>
#include "lkpyramid.hpp"
namespace
{
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
{
using namespace cv;
using cv::detail::deriv_type;
int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
CV_Assert(depth == CV_8U);
dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
#ifdef HAVE_TEGRA_OPTIMIZATION
if (tegra::calcSharrDeriv(src, dst))
return;
#endif
int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
int three = 3, ten = 10;
for( y = 0; y < rows; y++ )
{
const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
const uchar* srow1 = src.ptr<uchar>(y);
const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
deriv_type* drow = dst.ptr<deriv_type>(y);
// do vertical convolution
x = 0;
#ifdef CV_NEON
//assumes deriv_type is 16 bits
if(sizeof(deriv_type) == 2 && colsn >= 16)
{
__asm__ volatile ( "vdup.16 q8, %0\n\t"
"vdup.8 d18, %1\n\t"
:
: "r" (three), "r" (ten)
: );
for( ; x <= colsn - 8; x += 8)
{
__asm__ volatile ( "vld1.8 {d0}, [%0]\n\t"
"vld1.8 {d1}, [%1]\n\t"
"vld1.8 {d2}, [%2]\n\t"
"vaddl.u8 q4, d0, d2\n\t"
"vsubl.u8 q11, d2, d0\n\t"
"vmul.u16 q5, q4, q8\n\t"
"vmull.u8 q6, d1, d18\n\t"
"vadd.u16 q10, q6, q5\n\t"
"vst1.16 {q10}, [%3]\n\t"
"vst1.16 {q11}, [%4]\n\t"
:
: "r" (srow0 + x),
"r" (srow1 + x),
"r" (srow2 + x),
"r" (trow0 + x),
"r" (trow1 + x)
:
);
}
}
#endif
for( ; x < colsn; x++ )
{
int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
int t1 = srow2[x] - srow0[x];
trow0[x] = (deriv_type)t0;
trow1[x] = (deriv_type)t1;
}
// make border
int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
for( int k = 0; k < cn; k++ )
{
trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
}
#ifdef CV_NEON
__asm__ volatile ( "vdup.16 q8, %0\n\t"
"vdup.16 q9, %1\n\t"
:
: "r" (three), "r" (ten)
: );
#endif
// do horizontal convolution, interleave the results and store them to dst
x = 0;
#ifdef CV_NEON
//assumes size of deriv_type is 16 bits
if(sizeof(deriv_type) == 2 && colsn >= 16)
{
for( ; x <= colsn - 8; x += 8 )
{
__asm__ volatile (
"vld1.16 {q0}, [%0]\n\t" //trow0[x + cn]
"vld1.16 {q1}, [%1]\n\t" //trow0[x - cn]
"vsub.i16 q5, q0, q1\n\t" //this is t0
"vld1.16 {q2}, [%2]\n\t" //trow1[x + cn]
"vld1.16 {q3}, [%3]\n\t" //trow1[x - cn]
"vadd.i16 q6, q2, q3\n\t" //this needs mult by 3
"vld1.16 {q4}, [%4]\n\t" //trow1[x]
"vmul.i16 q7, q6, q8\n\t" //this needs to add to trow1[x]*10
"vmul.i16 q10, q4, q9\n\t" //this is trow1[x]*10
"vadd.i16 q11, q7, q10\n\t" //this is t1
"vswp d22, d11\n\t"
"vst2.16 {q5}, [%5]\n\t" //interleave
"vst2.16 {q11}, [%6]\n\t" //interleave
:
: "r" (trow0 + x + cn), //0
"r" (trow0 + x - cn), //1
"r" (trow1 + x + cn), //2
"r" (trow1 + x - cn), //3
"r" (trow1 + x), //4
"r" (drow + (x*2)), //5
"r" (drow + (x*2)+8) //6
:
);
}
}
#endif
for( ; x < colsn; x++ )
{
deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
drow[x*2] = t0; drow[x*2+1] = t1;
}
}
}
}//namespace
cv::detail::LKTrackerInvoker::LKTrackerInvoker(
const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
const Point2f* _prevPts, Point2f* _nextPts,
uchar* _status, float* _err,
Size _winSize, TermCriteria _criteria,
int _level, int _maxLevel, int _flags, float _minEigThreshold )
{
prevImg = &_prevImg;
prevDeriv = &_prevDeriv;
nextImg = &_nextImg;
prevPts = _prevPts;
nextPts = _nextPts;
status = _status;
err = _err;
winSize = _winSize;
criteria = _criteria;
level = _level;
maxLevel = _maxLevel;
flags = _flags;
minEigThreshold = _minEigThreshold;
}
void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
{
Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
const Mat& I = *prevImg;
const Mat& J = *nextImg;
const Mat& derivI = *prevDeriv;
int j, cn = I.channels(), cn2 = cn*2;
cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
int derivDepth = DataType<deriv_type>::depth;
Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
for( int ptidx = range.start; ptidx < range.end; ptidx++ )
{
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
Point2f nextPt;
if( level == maxLevel )
{
if( flags & OPTFLOW_USE_INITIAL_FLOW )
nextPt = nextPts[ptidx]*(float)(1./(1 << level));
else
nextPt = prevPt;
}
else
nextPt = nextPts[ptidx]*2.f;
nextPts[ptidx] = nextPt;
Point2i iprevPt, inextPt;
prevPt -= halfWin;
iprevPt.x = cvFloor(prevPt.x);
iprevPt.y = cvFloor(prevPt.y);
if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
continue;
}
volatile float a = prevPt.x - iprevPt.x;
volatile float b = prevPt.y - iprevPt.y;
volatile const int W_BITS = 14, W_BITS1 = 14;
volatile const float FLT_SCALE = 1.f/(1 << 20); //volatile is needed because compiler will optimize this out for NEON
volatile int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
volatile int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
volatile int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
volatile int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
volatile int dstep = (int)(derivI.step/derivI.elemSize1());
volatile int stepI = (int)(I.step/I.elemSize1());
volatile int stepJ = (int)(J.step/J.elemSize1());
volatile float A11 = 0, A12 = 0, A22 = 0;
#ifdef CV_NEON
volatile int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
volatile const int shifter1 = -(W_BITS - 5); //negative so it shifts right
volatile const int shifter2 = -(W_BITS);
if(sizeof(deriv_type) == 2)
{
__asm__ volatile ( "vdup.16 d26, %0\n\t"
"vdup.16 d27, %1\n\t"
"vdup.16 d28, %2\n\t"
"vdup.16 d29, %3\n\t"
"vdup.32 q11, %4\n\t"
"vdup.32 q12, %5\n\t"
:
: "r" ((short)iw00),
"r" ((short)iw01),
"r" ((short)iw10),
"r" ((short)iw11),
"r" (shifter1),
"r" (shifter2)
: );
}
#endif
// extract the patch from the first image, compute covariation matrix of derivatives
volatile int x, y;
for( y = 0; y < winSize.height; y++ )
{
volatile const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*stepI + iprevPt.x*cn;
volatile const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2;
volatile deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step);
volatile deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);
x = 0;
#ifdef CV_NEON
if(sizeof(deriv_type) == 2 && winSize.width*cn >= 12)
{
for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
{
__asm__ volatile (
"vld1.8 {d0}, [%0]\n\t" //ignores last 4 bytes
"vmovl.u8 q0, d0\n\t" //expand to 16-bit
"vld1.8 {d2}, [%1]\n\t"
"vmovl.u8 q1, d2\n\t"
"vmull.s16 q5, d0, d26\n\t"
"vmull.s16 q6, d2, d27\n\t"
"vld1.8 {d4}, [%2]\n\t"
"vmovl.u8 q2, d4\n\t" //expand
"vld1.8 {d6}, [%3]\n\t"
"vmovl.u8 q3, d6\n\t"
"vmull.s16 q7, d4, d28\n\t"
"vmull.s16 q8, d6, d29\n\t"
"vadd.i32 q5, q5, q6\n\t"
"vadd.i32 q7, q7, q8\n\t"
"vadd.i32 q5, q5, q7\n\t"
"vld2.16 {d0, d1}, [%4]\n\t" //evens in d0 and d2
"vld2.16 {d2, d3}, [%5]\n\t"
"vqrshl.s32 q5, q5, q11\n\t"
"vmull.s16 q4, d0, d26\n\t" //q4 is mult of even 1
"vmull.s16 q6, d1, d26\n\t" //q6 is mult of odd 1
"vmovn.s32 d0, q5\n\t"
"vmull.s16 q7, d2, d27\n\t" //q7 is mult of even 2
"vmull.s16 q8, d3, d27\n\t" //q8 is mult of odd 2
"vst1.16 {d0}, [%8]\n\t"
"vld2.16 {d4, d5}, [%6]\n\t" //evens in d4 and d6
"vld2.16 {d6, d7}, [%7]\n\t"
"vadd.i32 q4, q4, q7\n\t" //this frees up q7 and q8
"vadd.i32 q6, q6, q8\n\t" //q4 is added even 1 and 2
//q6 is added odd 1 and 2
"vmull.s16 q7, d4, d28\n\t" //q7 is mult of even 3
"vmull.s16 q0, d5, d28\n\t" //q0 is mult of odd 3
"vmull.s16 q8, d6, d29\n\t" //q8 is mult of even 4
"vmull.s16 q15, d7, d29\n\t" //q15 is mult of odd 4
"vadd.i32 q7, q7, q8\n\t" //q7 is added even 3 and 4
"vadd.i32 q0, q0, q15\n\t" //q0 is added odd 3 and 4
"vadd.i32 q4, q4, q7\n\t" //q4 is added even 1,2,3,4 -- will be ixval
"vadd.i32 q6, q6, q0\n\t" //q6 is added odd 1,2,3,4 -- will be iyval
"vld1.32 {q1}, [%11]\n\t"
"vld1.32 {q2}, [%12]\n\t"
"vld1.32 {q0}, [%10]\n\t" //get the loads prepared
"vqrshl.s32 q4, q4, q12\n\t" //q4 is descaled evens added
"vqrshl.s32 q6, q6, q12\n\t" //q6 is descaled odds added
//now ixval is stored in q4 and iyval is stored in q6 and ival is in q5
"vmul.s32 q7, q4, q4\n\t"
"vmul.s32 q8, q4, q6\n\t"
"vmul.s32 q15, q6, q6\n\t"
"vadd.i32 q0, q0, q7\n\t"
"vadd.i32 q1, q1, q8\n\t"
"vadd.i32 q2, q2, q15\n\t"
"vst1.32 {q0}, [%10]\n\t"
"vst1.32 {q1}, [%11]\n\t"
"vst1.32 {q2}, [%12]\n\t"
"vmovn.i32 d8, q4\n\t" //bring ixval to short
"vmovn.i32 d12, q6\n\t" //bring iyval to short
"vswp d9, d12\n\t" //now d8 is ixval and d9 is iyval
"vst2.16 {d8, d9}, [%9]\n\t"
:
: "r" (src + x), //0
"r" (src + x + cn), //1
"r" (src + x + stepI), //2
"r" (src + x + stepI + cn), //3
"r" (dsrc), //4
"r" (dsrc + cn2), //5
"r" (dsrc + dstep), //6
"r" (dsrc + dstep + cn2), //7
"r" (Iptr + x), //8
"r" (dIptr), //9
"r" (nA11), //10
"r" (nA12), //11
"r" (nA22) //12
: );
}
}
#endif
for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
{
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);
}
}
#ifdef CV_NEON
A11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
A12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
A22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
volatile float D = A11*A22 - A12*A12;
float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
4.f*A12*A12))/(2*winSize.width*winSize.height);
if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 )
err[ptidx] = (float)minEig;
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
status[ptidx] = false;
continue;
}
D = 1.f/D;
nextPt -= halfWin;
Point2f prevDelta;
for( j = 0; j < criteria.maxCount; j++ )
{
inextPt.x = cvFloor(nextPt.x);
inextPt.y = cvFloor(nextPt.y);
if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
inextPt.y < -winSize.height || inextPt.y >= J.rows )
{
if( level == 0 && status )
status[ptidx] = false;
break;
}
a = nextPt.x - inextPt.x;
b = nextPt.y - inextPt.y;
iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float b1 = 0, b2 = 0;
for( y = 0; y < winSize.height; y++ )
{
const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*stepJ + inextPt.x*cn;
const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);
const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);
x = 0;
for( ; x < winSize.width*cn; x++, dIptr += 2 )
{
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (float)(diff*dIptr[0]);
b2 += (float)(diff*dIptr[1]);
}
}
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
Point2f delta( (float)((A12*b2 - A22*b1) * D),
(float)((A12*b1 - A11*b2) * D));
//delta = -delta;
nextPt += delta;
nextPts[ptidx] = nextPt + halfWin;
if( delta.ddot(delta) <= criteria.epsilon )
break;
if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
std::abs(delta.y + prevDelta.y) < 0.01 )
{
nextPts[ptidx] -= delta*0.5f;
break;
}
prevDelta = delta;
}
if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 )
{
Point2f nextPoint = nextPts[ptidx] - halfWin;
Point inextPoint;
inextPoint.x = cvFloor(nextPoint.x);
inextPoint.y = cvFloor(nextPoint.y);
if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
{
if( status )
status[ptidx] = false;
continue;
}
float aa = nextPoint.x - inextPoint.x;
float bb = nextPoint.y - inextPoint.y;
iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float errval = 0.f;
for( y = 0; y < winSize.height; y++ )
{
const uchar* Jptr = (const uchar*)J.data + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);
for( x = 0; x < winSize.width*cn; x++ )
{
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((float)diff);
}
}
err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
}
}
}
【问题讨论】:
-
你用的是什么编译器?
-
请显示实际的
asm代码、编译器的版本和实际的目标处理器。 -
根据文档:
An asm instruction without any output operands is treated identically to a volatile asm instruction.- 所以volatile限定符隐含在您的代码中。您没有任何输出或 modify 任何寄存器的事实对我来说似乎很可疑。我不是 ARM 专家,但如果修改了寄存器,将其标记为仅输入是不够的。事实上这是很危险的,因为编译器可能会假设一个仅输入的寄存器不会在asm语句中发生变化,并且可能会继续使用它的 expected 值。 -
@BrettHale 感谢您的提示。我没有修改通用寄存器(只是它们指向的地址),但我正在修改 NEON 寄存器。我会尝试将这些寄存器添加到输出中。还是应该被破坏?
-
您正在更新内存而不是 C 变量,因此您不需要将向量添加到输出操作数列表中。但是我建议你破坏内存和所有修改(使用)的寄存器。您也可以尝试为您的输入地址使用特定的 ARM 寄存器,例如“[r1] "r" (srow0 + x)”,并破坏这些寄存器(本例中为 r1)。
标签: c++ optimization assembly