// Pixel UberShader for 0 texgens, per-pixel depth
//
// Interprets the TEV (texture environment) register state supplied in the
// PSBlock constant buffer at run time: every combiner stage is decoded from
// packed register words inside a loop, then the depth, alpha-test, dither,
// fog and output-format steps are applied.

// Integer dot product of two 3-component vectors.
int idot(int3 x, int3 y)
{
  int3 tmp = x * y;
  return tmp.x + tmp.y + tmp.z;
}

// Integer dot product of two 4-component vectors.
int idot(int4 x, int4 y)
{
  int4 tmp = x * y;
  return tmp.x + tmp.y + tmp.z + tmp.w;
}

// Round with round() and convert to the matching integer type.
int iround(float x) { return int (round(x)); }
int2 iround(float2 x) { return int2(round(x)); }
int3 iround(float3 x) { return int3(round(x)); }
int4 iround(float4 x) { return int4(round(x)); }

SamplerState samp[8] : register(s0);
Texture2DArray Tex[8] : register(t0);

// Shader constants. Keep the member order exactly as-is: the layout of a
// constant buffer follows declaration order.
cbuffer PSBlock : register(b0) {
  int4 color[4];   // initial values of the four TEV registers (see main)
  int4 k[4];
  int4 alphaRef;   // .r / .g are the two alpha-test reference values
  float4 texdim[8];
  int4 czbias[2];
  int4 cindscale[2];
  int4 cindmtx[6];
  int4 cfogcolor;
  int4 cfogi;
  float4 cfogf[2];
  float4 czslope;
  float2 cefbscale;
  uint bpmem_genmode;
  uint bpmem_alphaTest;
  uint bpmem_fogParam3;
  uint bpmem_fogRangeBase;
  uint bpmem_dstalpha;
  uint bpmem_ztex_op;
  bool bpmem_late_ztest;
  bool bpmem_rgba6_format;
  bool bpmem_dither;
  bool bpmem_bounding_box;
  uint4 bpmem_pack1[16];
  uint4 bpmem_pack2[8];
  int4 konstLookup[32];
};

// Accessors for the packed register arrays above.
// bpmem_pack1[i]: .xy = color/alpha combiner words, .z = tevind, .w = iref
// bpmem_pack2[i]: .x = tevorder, .y = tevksel
#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)
#define bpmem_tevind(i) (bpmem_pack1[(i)].z)
#define bpmem_iref(i) (bpmem_pack1[(i)].w)
#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)
#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)

struct VS_OUTPUT {
  float4 pos : POSITION;
  float4 colors_0 : COLOR0;
  float4 colors_1 : COLOR1;
  float4 clipPos : TEXCOORD0;
  float clipDist0 : SV_ClipDistance0;
  float clipDist1 : SV_ClipDistance1;
};

// Returns `size` bits of `val`, starting at bit `off`, right-aligned.
uint bitfieldExtract(uint val, int off, int size) {
  // This built-in function is only supported in OpenGL 4.0+ and ES 3.1+
  // Microsoft's HLSL compiler automatically optimises this to a bitfield extract instruction.
  uint mask = uint((1 << size) - 1);
  return uint(val >> off) & mask;
}

// Samples layer 0 of texture/sampler pair `sampler_num` at `uv` and returns
// the texel scaled by 255 and rounded to integers.
int4 sampleTexture(uint sampler_num, float2 uv) {
  // This is messy, but DirectX, OpenGL 3.3 and OpenGL ES 3.0 don't support dynamic indexing of the sampler array
  // With any luck the shader compiler will optimise this if the hardware supports dynamic indexing.
  switch(sampler_num) {
  case 0u: return iround(Tex[0].Sample(samp[0], float3(uv, 0.0)) * 255.0);
  case 1u: return iround(Tex[1].Sample(samp[1], float3(uv, 0.0)) * 255.0);
  case 2u: return iround(Tex[2].Sample(samp[2], float3(uv, 0.0)) * 255.0);
  case 3u: return iround(Tex[3].Sample(samp[3], float3(uv, 0.0)) * 255.0);
  case 4u: return iround(Tex[4].Sample(samp[4], float3(uv, 0.0)) * 255.0);
  case 5u: return iround(Tex[5].Sample(samp[5], float3(uv, 0.0)) * 255.0);
  case 6u: return iround(Tex[6].Sample(samp[6], float3(uv, 0.0)) * 255.0);
  case 7u: return iround(Tex[7].Sample(samp[7], float3(uv, 0.0)) * 255.0);
  default:
    // Only eight texture units exist; give every control path a defined
    // return value instead of leaving it unspecified.
    return int4(0, 0, 0, 0);
  }
}

// Reorders the channels of `color` using swap table `s`.
// Each output channel is picked by a 2-bit index: r/g come from bits 0-1 and
// 2-3 of bpmem_tevksel(s * 2), b/a from the same bits of bpmem_tevksel(s * 2 + 1).
int4 Swizzle(uint s, int4 color) {
  // AKA: Color Channel Swapping
  int4 ret;
  ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)];
  ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)];
  ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)];
  ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)];
  return ret;
}

// Applies a wrap mode to a coordinate by masking it.
// mode 0 leaves the coordinate untouched, modes 1-5 keep the low bits
// selected by (0xfffe >> mode), any other mode yields 0.
// NOTE(review): no caller is visible in this shader variant.
int Wrap(int coord, uint mode) {
  if (mode == 0u) // ITW_OFF
    return coord;
  else if (mode < 6u) // ITW_256 to ITW_16
    return coord & (0xfffe >> mode);
  else // ITW_0
    return 0;
}

// TEV's Linear Interpolate, plus bias, add/subtract and scale
//   A, B, C : lerp inputs (C is the blend factor, 0..255)
//   D       : value added to / subtracted from
//   bias    : 1 adds 128 to D, 2 subtracts 128 from D, anything else: none
//   op      : false = D + lerp, true = D - lerp
//   alpha   : true when called for the alpha channel (affects rounding)
//   shift   : 0-2 = left shift applied to lerp and D, 3 = halve the result
int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) {
  // Scale C from 0..255 to 0..256
  C += C >> 7;

  // Add bias to D
  if (bias == 1u) D += 128;
  else if (bias == 2u) D -= 128;

  int lerp = (A << 8) + (B - A)*C;
  if (shift != 3u) {
    lerp = lerp << shift;
    D = D << shift;
  }

  // Rounding term: applied when (shift == 3) matches the alpha flag.
  if ((shift == 3u) == alpha)
    lerp = lerp + (op ? 127 : 128);

  int result = lerp >> 8;

  // Add/Subtract D
  if(op) // Subtract
    result = D - result;
  else // Add
    result = D + result;

  // Most of the Shift was moved inside the lerp for improved precision
  // But we still do the divide by 2 here
  if (shift == 3u)
    result = result >> 1;
  return result;
}

// TEV's Linear Interpolate, plus bias, add/subtract and scale
// Three-channel version of tevLerp; the parameters have the same meaning.
int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) {
  // Scale C from 0..255 to 0..256
  C += C >> 7;

  // Add bias to D
  if (bias == 1u) D += 128;
  else if (bias == 2u) D -= 128;

  int3 lerp = (A << 8) + (B - A)*C;
  if (shift != 3u) {
    lerp = lerp << shift;
    D = D << shift;
  }

  // Rounding term: applied when (shift == 3) matches the alpha flag.
  if ((shift == 3u) == alpha)
    lerp = lerp + (op ? 127 : 128);

  int3 result = lerp >> 8;

  // Add/Subtract D
  if(op) // Subtract
    result = D - result;
  else // Add
    result = D + result;

  // Most of the Shift was moved inside the lerp for improved precision
  // But we still do the divide by 2 here
  if (shift == 3u)
    result = result >> 1;
  return result;
}

// Implements operations 0-5 of tev's compare mode,
// which are common to both color and alpha channels
// Returns false for any other op value.
bool tevCompare(uint op, int3 color_A, int3 color_B) {
  switch (op) {
  case 0u: // TEVCMP_R8_GT
    return (color_A.r > color_B.r);
  case 1u: // TEVCMP_R8_EQ
    return (color_A.r == color_B.r);
  case 2u: { // TEVCMP_GR16_GT
    // Braces keep these temporaries local to this case.
    int A_16 = (color_A.r | (color_A.g << 8));
    int B_16 = (color_B.r | (color_B.g << 8));
    return A_16 > B_16;
  }
  case 3u: // TEVCMP_GR16_EQ
    return (color_A.r == color_B.r && color_A.g == color_B.g);
  case 4u: { // TEVCMP_BGR24_GT
    int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16));
    int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16));
    return A_24 > B_24;
  }
  case 5u: // TEVCMP_BGR24_EQ
    return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b);
  default:
    return false;
  }
}

// Helper function for Alpha Test
// Evaluates `a <compare> b`; callers in this file pass a 3-bit compare code.
bool alphaCompare(int a, int b, uint compare) {
  switch (compare) {
  case 0u: // NEVER
    return false;
  case 1u: // LESS
    return a < b;
  case 2u: // EQUAL
    return a == b;
  case 3u: // LEQUAL
    return a <= b;
  case 4u: // GREATER
    return a > b;
  case 5u: // NEQUAL
    return a != b;
  case 6u: // GEQUAL
    return a >= b;
  case 7u: // ALWAYS
    return true;
  default:
    // Not reachable with a 3-bit code; present so every control path
    // returns a value (same fallback as tevCompare).
    return false;
  }
}

// TEV state carried across stages.
struct State {
  int4 Reg[4];     // [0] = prev, [1] = c0, [2] = c1, [3] = c2
  int4 TexColor;
  int AlphaBump;
};

// Per-stage decoded register words.
struct StageState {
  uint stage;   // stage index
  uint order;   // tevorder word, already shifted for odd stages (see main)
  uint cc;      // color combiner word
  uint ac;      // alpha combiner word
};

int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1);
int4 getKonstColor(State s, StageState ss);

// Returns the RGB input selected by the 4-bit color-input code `index`.
int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
  switch (index) {
  case 0u: // prev.rgb
    return s.Reg[0].rgb;
  case 1u: // prev.aaa
    return s.Reg[0].aaa;
  case 2u: // c0.rgb
    return s.Reg[1].rgb;
  case 3u: // c0.aaa
    return s.Reg[1].aaa;
  case 4u: // c1.rgb
    return s.Reg[2].rgb;
  case 5u: // c1.aaa
    return s.Reg[2].aaa;
  case 6u: // c2.rgb
    return s.Reg[3].rgb;
  case 7u: // c2.aaa
    return s.Reg[3].aaa;
  case 8u:
    return s.TexColor.rgb;
  case 9u:
    return s.TexColor.aaa;
  case 10u:
    return getRasColor(s, ss, colors_0, colors_1).rgb;
  case 11u:
    return getRasColor(s, ss, colors_0, colors_1).aaa;
  case 12u: // One
    return int3(255, 255, 255);
  case 13u: // Half
    return int3(128, 128, 128);
  case 14u:
    return getKonstColor(s, ss).rgb;
  case 15u: // Zero
    return int3(0, 0, 0);
  default:
    // Not reachable with a 4-bit code; present so every control path
    // returns a value.
    return int3(0, 0, 0);
  }
}

// Returns the alpha input selected by the 3-bit alpha-input code `index`.
int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
  switch (index) {
  case 0u: // prev.a
    return s.Reg[0].a;
  case 1u: // c0.a
    return s.Reg[1].a;
  case 2u: // c1.a
    return s.Reg[2].a;
  case 3u: // c2.a
    return s.Reg[3].a;
  case 4u:
    return s.TexColor.a;
  case 5u:
    return getRasColor(s, ss, colors_0, colors_1).a;
  case 6u:
    return getKonstColor(s, ss).a;
  case 7u: // Zero
    return 0;
  default:
    // Not reachable with a 3-bit code; present so every control path
    // returns a value.
    return 0;
  }
}

// Reads TEV register `index` (0-3); any other index reads prev.
int4 getTevReg(in State s, uint index) {
  switch (index) {
  case 0u: // prev
    return s.Reg[0];
  case 1u: // c0
    return s.Reg[1];
  case 2u: // c1
    return s.Reg[2];
  case 3u: // c2
    return s.Reg[3];
  default: // prev
    return s.Reg[0];
  }
}

// Writes the RGB part of TEV register `index` (0-3); other indices are ignored.
void setRegColor(inout State s, uint index, int3 color) {
  switch (index) {
  case 0u: // prev
    s.Reg[0].rgb = color;
    break;
  case 1u: // c0
    s.Reg[1].rgb = color;
    break;
  case 2u: // c1
    s.Reg[2].rgb = color;
    break;
  case 3u: // c2
    s.Reg[3].rgb = color;
    break;
  }
}

// Writes the alpha part of TEV register `index` (0-3); other indices are ignored.
void setRegAlpha(inout State s, uint index, int alpha) {
  switch (index) {
  case 0u: // prev
    s.Reg[0].a = alpha;
    break;
  case 1u: // c0
    s.Reg[1].a = alpha;
    break;
  case 2u: // c1
    s.Reg[2].a = alpha;
    break;
  case 3u: // c2
    s.Reg[3].a = alpha;
    break;
  }
}

// Pixel shader entry point.
//   ocol0 : color written to the render target
//   ocol1 : second blend source; only .a is non-zero (see end of function)
//   depth : per-pixel depth output
void main(
  out float4 ocol0 : SV_Target0,
  out float4 ocol1 : SV_Target1,
  out float depth : SV_Depth,
  in float4 rawpos : SV_Position,
  in float4 colors_0 : COLOR0,
  in float4 colors_1 : COLOR1,
  in float4 clipPos : TEXCOORD0,
  in float clipDist0 : SV_ClipDistance0,
  in float clipDist1 : SV_ClipDistance1
  ) {
  int3 tevcoord = int3(0, 0, 0);
  State s;
  s.TexColor = int4(0, 0, 0, 0);
  s.AlphaBump = 0;

  // Seed the TEV registers from the constant buffer.
  s.Reg[0] = color[0];
  s.Reg[1] = color[1];
  s.Reg[2] = color[2];
  s.Reg[3] = color[3];

  // 4-bit field; the loop below runs stages 0..num_stages inclusive.
  uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4);

  // Main tev loop
  [loop]
  for(uint stage = 0u; stage <= num_stages; stage++)
  {
    StageState ss;
    ss.stage = stage;
    ss.cc = bpmem_combiners(stage).x;
    ss.ac = bpmem_combiners(stage).y;
    // Two stages share one tevorder word; odd stages use the bits from 12 up.
    ss.order = bpmem_tevorder(stage>>1);
    if ((stage & 1u) == 1u)
      ss.order = ss.order >> 12;

    // This is the Meat of TEV
    {
      // Color Combiner
      uint color_a = bitfieldExtract(ss.cc, 12, 4);
      uint color_b = bitfieldExtract(ss.cc, 8, 4);
      uint color_c = bitfieldExtract(ss.cc, 4, 4);
      uint color_d = bitfieldExtract(ss.cc, 0, 4);
      uint color_bias = bitfieldExtract(ss.cc, 16, 2);
      bool color_op = bool(bitfieldExtract(ss.cc, 18, 1));
      bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1));
      uint color_shift = bitfieldExtract(ss.cc, 20, 2);
      uint color_dest = bitfieldExtract(ss.cc, 22, 2);

      // In compare mode the shift and op bits together form the compare op.
      uint color_compare_op = color_shift << 1 | uint(color_op);

      int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255);
      int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255);
      int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255);
      int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d);  // 10 bits + sign

      // Note: this local shadows the cbuffer member `color` for the rest of
      // this scope.
      int3 color;
      if(color_bias != 3u) { // Normal mode
        color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift);
      } else { // Compare mode
        // op 6 and 7 do a select per color channel
        if (color_compare_op == 6u) {
          // TEVCMP_RGB8_GT
          color.r = (color_A.r > color_B.r) ? color_C.r : 0;
          color.g = (color_A.g > color_B.g) ? color_C.g : 0;
          color.b = (color_A.b > color_B.b) ? color_C.b : 0;
        } else if (color_compare_op == 7u) {
          // TEVCMP_RGB8_EQ
          color.r = (color_A.r == color_B.r) ? color_C.r : 0;
          color.g = (color_A.g == color_B.g) ? color_C.g : 0;
          color.b = (color_A.b == color_B.b) ? color_C.b : 0;
        } else {
          // The remaining ops do one compare which selects all 3 channels
          color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0);
        }
        color = color_D + color;
      }

      // Clamp result
      if (color_clamp)
        color = clamp(color, 0, 255);
      else
        color = clamp(color, -1024, 1023);

      // Write result to the correct input register of the next stage
      setRegColor(s, color_dest, color);

      // Alpha Combiner
      uint alpha_a = bitfieldExtract(ss.ac, 13, 3);
      uint alpha_b = bitfieldExtract(ss.ac, 10, 3);
      uint alpha_c = bitfieldExtract(ss.ac, 7, 3);
      uint alpha_d = bitfieldExtract(ss.ac, 4, 3);
      uint alpha_bias = bitfieldExtract(ss.ac, 16, 2);
      bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1));
      bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1));
      uint alpha_shift = bitfieldExtract(ss.ac, 20, 2);
      uint alpha_dest = bitfieldExtract(ss.ac, 22, 2);

      uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op);

      // Zero-initialised so they are never read uninitialised; the values are
      // only meaningful on the paths that load them below.
      int alpha_A = 0;
      int alpha_B = 0;
      if (alpha_bias != 3u || alpha_compare_op > 5u) {
        // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5
        alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255;
        alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255;
      }
      int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255;
      int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign

      int alpha;
      if(alpha_bias != 3u) { // Normal mode
        alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift);
      } else { // Compare mode
        if (alpha_compare_op == 6u) {
          // TEVCMP_A8_GT
          alpha = (alpha_A > alpha_B) ? alpha_C : 0;
        } else if (alpha_compare_op == 7u) {
          // TEVCMP_A8_EQ
          alpha = (alpha_A == alpha_B) ? alpha_C : 0;
        } else {
          // All remaining alpha compare ops actually compare the color channels
          alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0;
        }
        alpha = alpha_D + alpha;
      }

      // Clamp result
      if (alpha_clamp)
        alpha = clamp(alpha, 0, 255);
      else
        alpha = clamp(alpha, -1024, 1023);

      // Write result to the correct input register of the next stage
      setRegAlpha(s, alpha_dest, alpha);
    }
  } // Main tev loop

  // The final color/alpha come from the destination registers of the last stage.
  int4 TevResult;
  TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz;
  TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w;
  TevResult &= 255;

  // Convert the rasterised depth to a 24-bit integer (inverted).
  int zCoord = int((1.0 - rawpos.z) * 16777216.0);
  zCoord = clamp(zCoord, 0, 0xFFFFFF);

  // ZFreeze
  if ((bpmem_genmode & 524288u) != 0u) {
    float2 screenpos = rawpos.xy * cefbscale.xy;
    zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y);
  }

  // Depth Texture
  int early_zCoord = zCoord;
  if (bpmem_ztex_op != 0u) {
    int ztex = int(czbias[1].w); // fixed bias

    // Whatever texture was in our last stage, it's now our depth texture
    ztex += idot(s.TexColor.xyzw, czbias[0].xyzw);
    ztex += (bpmem_ztex_op == 1u) ? zCoord : 0;
    zCoord = ztex & 0xFFFFFF;
  }

  // If early depth is enabled, write to zbuffer before depth textures
  // If early depth isn't enabled, we write to the zbuffer here
  int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord;
  depth = 1.0 - float(zbuffer_zCoord) / 16777216.0;

  // Alpha Test
  if (bpmem_alphaTest != 0u) {
    bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3));
    bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3));

    // These if statements are written weirdly to work around Intel and Qualcomm bugs with handling booleans.
    switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) {
    case 0u: // AND
      if (comp0 && comp1) break; else discard; break;
    case 1u: // OR
      if (comp0 || comp1) break; else discard; break;
    case 2u: // XOR
      if (comp0 != comp1) break; else discard; break;
    case 3u: // XNOR
      if (comp0 == comp1) break; else discard; break;
    }
  }

  if (bpmem_dither) {
    // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
    // Here the matrix is encoded into the two factor constants
    int2 dither = int2(rawpos.xy) & 1;
    TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);
  }

  // Fog
  uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3);
  if (fog_function != 0u) {
    // TODO: This all needs to be converted from float to fixed point
    float ze;
    if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) {
      // perspective
      // ze = A/(B - (Zs >> B_SHF)
      ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w));
    } else {
      // orthographic
      // ze = a*Zs    (here, no B_SHF)
      ze = cfogf[1].x * float(zCoord) / 16777216.0;
    }

    if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) {
      // x_adjust = sqrt((x-center)^2 + k^2)/k
      // ze *= x_adjust
      // TODO Instead of this theoretical calculation, we should use the
      //      coefficient table given in the fog range BP registers!
      float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x;
      x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z;
      ze *= x_adjust;
    }

    float fog = clamp(ze - cfogf[1].z, 0.0, 1.0);

    // Functions 1-3 use the clamped value as-is; 4-7 reshape it.
    if (fog_function > 3u) {
      switch (fog_function) {
      case 4u:
        fog = 1.0 - exp2(-8.0 * fog);
        break;
      case 5u:
        fog = 1.0 - exp2(-8.0 * fog * fog);
        break;
      case 6u:
        fog = exp2(-8.0 * (1.0 - fog));
        break;
      case 7u:
        fog = 1.0 - fog;
        fog = exp2(-8.0 * fog * fog);
        break;
      }
    }

    // Blend towards the fog color with an 8-bit fixed-point factor.
    int ifog = iround(fog * 256.0);
    TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8;
  }

  if (bpmem_rgba6_format)
    ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;
  else
    ocol0.rgb = float3(TevResult.rgb) / 255.0;

  // Alpha is always written with 6-bit precision; a non-zero bpmem_dstalpha
  // replaces it with the low 8 bits of that register.
  if (bpmem_dstalpha != 0u)
    ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0;
  else
    ocol0.a = float(TevResult.a >> 2) / 63.0;

  // Dest alpha override (dual source blending)
  // Colors will be blended against the alpha from ocol1 and
  // the alpha from ocol0 will be written to the framebuffer.
  ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0);
}

// Returns the rasterised color input for the stage described by `ss`.
// The 3-bit selector at bits 7-9 of ss.order picks lighting channel 0 or 1
// (swizzled by the swap table in bits 0-1 of ss.ac), the alpha bump value,
// its normalised form, or zero for every other selector.
int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) {
  // Select Ras for stage
  uint ras = bitfieldExtract(ss.order, 7, 3);
  if (ras < 2u) { // Lighting Channel 0 or 1
    int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0);
    uint swap = bitfieldExtract(ss.ac, 0, 2);
    return Swizzle(swap, color);
  } else if (ras == 5u) { // Alpha Bump
    return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump);
  } else if (ras == 6u) { // Normalized Alpha Bump
    int normalized = s.AlphaBump | s.AlphaBump >> 5;
    return int4(normalized, normalized, normalized, normalized);
  } else {
    return int4(0, 0, 0, 0);
  }
}

// Returns the konst color for the stage described by `ss`.
// Two stages share one tevksel word: even stages take their rgb/alpha
// selectors from bits 4-8 / 9-13, odd stages from bits 14-18 / 19-23.
// Each 5-bit selector indexes konstLookup.
int4 getKonstColor(State s, StageState ss) {
  // Select Konst for stage
  // TODO: a switch case might be better here than a dynamically
  //       indexed uniform lookup
  uint tevksel = bpmem_tevksel(ss.stage>>1);
  if ((ss.stage & 1u) == 0u)
    return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a);
  else
    return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a);
}