I have been experimentally optimizing the source code for the past two years, and I found some bugs in the code. Could a Dolphin developer please check them?
================================
[1] Obvious bugs
================================
(1) Line 442 in OpcodeDecoding.cpp:
*DataReadU32xFuncs = *DataReadU32xFuncs_SSSE3;
Replace with:
for (int i = 0; i < 16; ++i) DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
--------------------------------
(2) Line 443 in JitAsmCommon.cpp:
MOVD_xmm(XMM0, M(&psTemp[0]));
Replace with:
MOVD_xmm(XMM0, R(EAX));
--------------------------------
(3) Line 124 in DSPLLE.cpp:
Common::SetCurrentThreadAffinity(1 << core_id);
Replace with:
Common::SetCurrentThreadAffinity(1 << (core_id - 1));
--------------------------------
(4) Line 152 in VertexLoader_Color.cpp
const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR]+colIndex);
Replace with:
const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]);
================================
[2] Probable bugs (careless mistakes?)
================================
(1) Line 287 in Jit_Util.cpp:
if (false && cpu_info.bSSSE3) {
This line must be moved to line 272.
--------------------------------
(2) Line 142 in Jit_LoadStore.cpp:
gpr.Flush(FLUSH_ALL);
This line can be deleted.
================================
[3] Changes that might fix bugs (and are expected to improve speed)
================================
(1) Jit64::ps_sel() function in Jit_Paired.cpp: (I confirmed that this fixes 'Sengoku Musou 3'; it still needs to be checked with other games.)
Replace with:
// Emits x86 code for ps_sel: every element of fD receives the matching
// element of fB where fA compares less than zero, and of fC otherwise.
// Instructions with the Rc bit set are handed to Default(inst) instead.
//
// NOTE(review): the compare uses predicate 1 (ordered less-than), so a NaN in
// fA yields a zero mask and selects fC. The PowerPC definition of ps_sel is
// believed to select frB for NaN inputs — confirm before merging.
void Jit64::ps_sel(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(Paired)
	if (inst.Rc) {
		Default(inst); return;
	}
	int d = inst.FD;
	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;
	fpr.Lock(a, b, c, d);
	// Request a load of d only when it is also one of the source registers.
	fpr.BindToRegister(d, d==b || d==c || d==a, true);
	MOVAPD(XMM1, fpr.R(a));
	XORPD(XMM0, R(XMM0));            // XMM0 = 0
	CMPPD(XMM1, R(XMM0), 1);         // XMM1 = all-ones mask where a < 0
	MOVAPD(XMM0, R(XMM1));           // keep a second copy of the mask
	ANDPD(XMM1, fpr.R(b));           // b where the mask is set
	ANDNPD(XMM0, fpr.R(c));          // c where the mask is clear
	ORPD(XMM0, R(XMM1));             // merge both selections
	MOVAPD(fpr.RX(d), R(XMM0));
	fpr.UnlockAll();
}
--------------------------------
(2) Jit64::psq_l() function in Jit_LoadStorePaired.cpp: (Please double-check this just in case. It is already implemented in JitIL.)
Insert at line 174:
if (inst.W) OR(32, R(EDX), Imm8(8));
Comment out lines 151-156.
================================
[4] Code I do not understand
================================
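(1) 'min_filter' is defined as a 3-bit field in BPMemory.h, but it is compared with 8 ('.min_filter == 8' and '.min_filter != 8') in some VertexManager.cpp and Render.cpp files. A 3-bit field can only hold the values 0 to 7, so '== 8' can never be true and '!= 8' is always true.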
================================
[5] Simple optimization hints for a small speedup (a free gift: some picks from my optimized code)
================================
(1) Line 134 in x64Emitter.h:
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
Replace with:
// NOTE(review): this matches the four-way comparison above only if
// SCALE_IMM8/16/32/64 are exactly 0xf0..0xf3 and no other scale value
// falls in that range — confirm against the enum before applying.
bool IsImm() const {return (scale & 0xfc) == 0xf0; }
--------------------------------
(2) Replace the LoadBPReg() function in BPMemory.cpp with:
// Applies one BP register write.
// The top 8 bits of value0 select the register; the value is merged into the
// register's current contents under bpmem.bpMask.
//  - Register 0xFE: the merged value becomes the new bpmem.bpMask and
//    BPWritten() is not called.
//  - Any other register: bpmem.bpMask is reset to 0xFFFFFF and BPWritten()
//    receives the register index, the changed bits (low 24) and the merged value.
void LoadBPReg(u32 value0)
{
	const int reg = value0 >> 24;
	const int previous = ((u32*)&bpmem)[reg];
	// Same result as (previous & ~mask) | (value0 & mask).
	const int merged = previous ^ ((value0 ^ previous) & bpmem.bpMask);

	if (reg == 0xFE) {
		// A write to the mask register only updates the mask.
		bpmem.bpMask = merged;
		return;
	}

	// The mask applies to a single write; restore the default first.
	bpmem.bpMask = 0xFFFFFF;
	const int changes = (previous ^ merged) & 0xFFFFFF;
	BPCmd bp = {reg, changes, merged};
	BPWritten(bp);
}
--------------------------------
(3) Replace the Matrix44::Multiply() function in MathUtil.cpp with:
// Multiplies two 4x4 float matrices, each stored as 16 consecutive floats with
// 4 floats per row: row i of result = sum over k of a[i*4 + k] * (row k of b).
// All three pointers are accessed with _mm_load_ps/_mm_store_ps, so they must
// be 16-byte aligned.
inline void MatrixMul4(const float *a, const float *b, float *result)
{
	// The four rows of b are loaded once and reused for every output row.
	const __m128 rhs0 = _mm_load_ps(b);
	const __m128 rhs1 = _mm_load_ps(b + 4);
	const __m128 rhs2 = _mm_load_ps(b + 8);
	const __m128 rhs3 = _mm_load_ps(b + 12);

	for (int row = 0; row != 4; ++row) {
		const float *src = a + row * 4;
		float *dst = result + row * 4;

		// Broadcast each element of the current row of a, scale the matching row of b.
		const __m128 lhs = _mm_load_ps(src);
		const __m128 term0 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(0, 0, 0, 0)), rhs0);
		const __m128 term1 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(1, 1, 1, 1)), rhs1);
		const __m128 term2 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(2, 2, 2, 2)), rhs2);
		const __m128 term3 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 3, 3, 3)), rhs3);

		// Pairwise sums: (term0 + term1) + (term2 + term3).
		const __m128 low_pair = _mm_add_ps(term0, term1);
		const __m128 high_pair = _mm_add_ps(term2, term3);
		_mm_store_ps(dst, _mm_add_ps(low_pair, high_pair));
	}
}
// Stores the product a * b in result by passing the three matrices' data
// arrays to MatrixMul4.
void Matrix44::Multiply(const Matrix44 &a, const Matrix44 &b, Matrix44 &result)
{
	const float *lhs = a.data;
	const float *rhs = b.data;
	float *out = result.data;
	MatrixMul4(lhs, rhs, out);
}
Also, replace line 198 in MathUtil.h with:
float GC_ALIGNED16(data[16]);
================================
This report is based on 'dolphin 3.0-735'.
If I find another bug, I will report again.
Would it have been better to post this report to the 'dolphin-emu issues' tracker on Google Code?
If you have questions or comments, please reply in 'simple English'.
Sorry for my bad English; I am from Japan.
================================
[1] Obvious bugs
================================
(1) Line 442 in OpcodeDecoding.cpp:
*DataReadU32xFuncs = *DataReadU32xFuncs_SSSE3;
Replace with:
for (int i = 0; i < 16; ++i) DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
--------------------------------
(2) Line 443 in JitAsmCommon.cpp:
MOVD_xmm(XMM0, M(&psTemp[0]));
Replace with:
MOVD_xmm(XMM0, R(EAX));
--------------------------------
(3) Line 124 in DSPLLE.cpp:
Common::SetCurrentThreadAffinity(1 << core_id);
Replace with:
Common::SetCurrentThreadAffinity(1 << (core_id - 1));
--------------------------------
(4) Line 152 in VertexLoader_Color.cpp
const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR]+colIndex);
Replace with:
const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]);
================================
[2] Probable bugs (careless mistakes?)
================================
(1) Line 287 in Jit_Util.cpp:
if (false && cpu_info.bSSSE3) {
This line must be moved to line 272.
--------------------------------
(2) Line 142 in Jit_LoadStore.cpp:
gpr.Flush(FLUSH_ALL);
This line can be deleted.
================================
[3] Changes that might fix bugs (and are expected to improve speed)
================================
(1) Jit64::ps_sel() function in Jit_Paired.cpp: (I confirmed that this fixes 'Sengoku Musou 3'; it still needs to be checked with other games.)
Replace with:
// Emits x86 code for ps_sel: every element of fD receives the matching
// element of fB where fA compares less than zero, and of fC otherwise.
// Instructions with the Rc bit set are handed to Default(inst) instead.
//
// NOTE(review): the compare uses predicate 1 (ordered less-than), so a NaN in
// fA yields a zero mask and selects fC. The PowerPC definition of ps_sel is
// believed to select frB for NaN inputs — confirm before merging.
void Jit64::ps_sel(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(Paired)
	if (inst.Rc) {
		Default(inst); return;
	}
	int d = inst.FD;
	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;
	fpr.Lock(a, b, c, d);
	// Request a load of d only when it is also one of the source registers.
	fpr.BindToRegister(d, d==b || d==c || d==a, true);
	MOVAPD(XMM1, fpr.R(a));
	XORPD(XMM0, R(XMM0));            // XMM0 = 0
	CMPPD(XMM1, R(XMM0), 1);         // XMM1 = all-ones mask where a < 0
	MOVAPD(XMM0, R(XMM1));           // keep a second copy of the mask
	ANDPD(XMM1, fpr.R(b));           // b where the mask is set
	ANDNPD(XMM0, fpr.R(c));          // c where the mask is clear
	ORPD(XMM0, R(XMM1));             // merge both selections
	MOVAPD(fpr.RX(d), R(XMM0));
	fpr.UnlockAll();
}
--------------------------------
(2) Jit64::psq_l() function in Jit_LoadStorePaired.cpp: (Please double-check this just in case. It is already implemented in JitIL.)
Insert at line 174:
if (inst.W) OR(32, R(EDX), Imm8(8));
Comment out lines 151-156.
================================
[4] Code I do not understand
================================
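(1) 'min_filter' is defined as a 3-bit field in BPMemory.h, but it is compared with 8 ('.min_filter == 8' and '.min_filter != 8') in some VertexManager.cpp and Render.cpp files. A 3-bit field can only hold the values 0 to 7, so '== 8' can never be true and '!= 8' is always true.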
================================
[5] Simple optimization hints for a small speedup (a free gift: some picks from my optimized code)
================================
(1) Line 134 in x64Emitter.h:
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
Replace with:
// NOTE(review): this matches the four-way comparison above only if
// SCALE_IMM8/16/32/64 are exactly 0xf0..0xf3 and no other scale value
// falls in that range — confirm against the enum before applying.
bool IsImm() const {return (scale & 0xfc) == 0xf0; }
--------------------------------
(2) Replace the LoadBPReg() function in BPMemory.cpp with:
// Applies one BP register write.
// The top 8 bits of value0 select the register; the value is merged into the
// register's current contents under bpmem.bpMask.
//  - Register 0xFE: the merged value becomes the new bpmem.bpMask and
//    BPWritten() is not called.
//  - Any other register: bpmem.bpMask is reset to 0xFFFFFF and BPWritten()
//    receives the register index, the changed bits (low 24) and the merged value.
void LoadBPReg(u32 value0)
{
	const int reg = value0 >> 24;
	const int previous = ((u32*)&bpmem)[reg];
	// Same result as (previous & ~mask) | (value0 & mask).
	const int merged = previous ^ ((value0 ^ previous) & bpmem.bpMask);

	if (reg == 0xFE) {
		// A write to the mask register only updates the mask.
		bpmem.bpMask = merged;
		return;
	}

	// The mask applies to a single write; restore the default first.
	bpmem.bpMask = 0xFFFFFF;
	const int changes = (previous ^ merged) & 0xFFFFFF;
	BPCmd bp = {reg, changes, merged};
	BPWritten(bp);
}
--------------------------------
(3) Replace the Matrix44::Multiply() function in MathUtil.cpp with:
// Multiplies two 4x4 float matrices, each stored as 16 consecutive floats with
// 4 floats per row: row i of result = sum over k of a[i*4 + k] * (row k of b).
// All three pointers are accessed with _mm_load_ps/_mm_store_ps, so they must
// be 16-byte aligned.
inline void MatrixMul4(const float *a, const float *b, float *result)
{
	// The four rows of b are loaded once and reused for every output row.
	const __m128 rhs0 = _mm_load_ps(b);
	const __m128 rhs1 = _mm_load_ps(b + 4);
	const __m128 rhs2 = _mm_load_ps(b + 8);
	const __m128 rhs3 = _mm_load_ps(b + 12);

	for (int row = 0; row != 4; ++row) {
		const float *src = a + row * 4;
		float *dst = result + row * 4;

		// Broadcast each element of the current row of a, scale the matching row of b.
		const __m128 lhs = _mm_load_ps(src);
		const __m128 term0 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(0, 0, 0, 0)), rhs0);
		const __m128 term1 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(1, 1, 1, 1)), rhs1);
		const __m128 term2 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(2, 2, 2, 2)), rhs2);
		const __m128 term3 = _mm_mul_ps(_mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 3, 3, 3)), rhs3);

		// Pairwise sums: (term0 + term1) + (term2 + term3).
		const __m128 low_pair = _mm_add_ps(term0, term1);
		const __m128 high_pair = _mm_add_ps(term2, term3);
		_mm_store_ps(dst, _mm_add_ps(low_pair, high_pair));
	}
}
// Stores the product a * b in result by passing the three matrices' data
// arrays to MatrixMul4.
void Matrix44::Multiply(const Matrix44 &a, const Matrix44 &b, Matrix44 &result)
{
	const float *lhs = a.data;
	const float *rhs = b.data;
	float *out = result.data;
	MatrixMul4(lhs, rhs, out);
}
Also, replace line 198 in MathUtil.h with:
float GC_ALIGNED16(data[16]);
================================
This report is based on 'dolphin 3.0-735'.
If I find another bug, I will report again.
Would it have been better to post this report to the 'dolphin-emu issues' tracker on Google Code?
If you have questions or comments, please reply in 'simple English'.
Sorry for my bad English; I am from Japan.