// Pixel UberShader for 0 texgens, per-pixel depth
// Integer dot product of two 3-component vectors.
int idot(int3 x, int3 y)
{
	return x.x * y.x + x.y * y.y + x.z * y.z;
}
// Integer dot product of two 4-component vectors.
int idot(int4 x, int4 y)
{
	return x.x * y.x + x.y * y.y + x.z * y.z + x.w * y.w;
}

// Float-to-integer conversion that goes through round() before the cast,
// so values are rounded to the nearest integer rather than truncated.
// One overload per vector width.
int iround(float x)
{
	return int(round(x));
}
int2 iround(float2 x)
{
	return int2(round(x));
}
int3 iround(float3 x)
{
	return int3(round(x));
}
int4 iround(float4 x)
{
	return int4(round(x));
}

// Eight sampler states, bound starting at sampler register s0.
// sampleTexture() pairs samp[i] with Tex[i].
SamplerState samp[8] : register(s0);

// Eight texture arrays, bound starting at texture register t0.
// sampleTexture() always reads array layer 0 (the z coordinate passed to Sample is 0.0).
Texture2DArray Tex[8] : register(t0);

// Pixel shader constant block (register b0).
// The field order and types define the buffer layout and must not be changed here.
// The notes below describe only how the code in this file reads each field;
// fields marked "not read" are unused by this 0-texgen variant as far as this file shows.
cbuffer PSBlock : register(b0) {
	int4 color[4];              // initial values of the four TEV registers (copied into State.Reg in main)
	int4 k[4];                  // not read in this file
	int4 alphaRef;              // .r / .g: reference values for the two alpha-test comparisons
	float4 texdim[8];           // not read in this file
	int4 czbias[2];             // depth texture: [0] is dotted with TexColor, [1].w is a fixed bias
	int4 cindscale[2];          // not read in this file
	int4 cindmtx[6];            // not read in this file
	int4 cfogcolor;             // .rgb: fog color blended into the result
	int4 cfogi;                 // perspective fog: .y is the B term, .w the right-shift applied to zCoord
	float4 cfogf[2];            // [0].xyz: fog range adjustment terms; [1].x: fog A coefficient; [1].z: subtracted before clamping
	float4 czslope;             // ZFreeze plane: z = czslope.z + czslope.x * x + czslope.y * y
	float2 cefbscale;           // scales rawpos.xy before the ZFreeze plane is evaluated
	uint  bpmem_genmode;        // bits 10-13: index of the last TEV stage; bit 19 (524288): ZFreeze enable
	uint  bpmem_alphaTest;      // 0 skips the alpha test; bits 16-18 / 19-21: compare ops; bits 22-23: combine logic
	uint  bpmem_fogParam3;      // bits 21-23: fog function (0 = off); bit 20: 0 = perspective, 1 = orthographic
	uint  bpmem_fogRangeBase;   // bit 10: enable fog range adjustment
	uint  bpmem_dstalpha;       // non-zero: low 8 bits replace the alpha written to ocol0
	uint  bpmem_ztex_op;        // 0: depth texture off; 1: add zCoord to the depth texture value; other: replace
	bool  bpmem_late_ztest;     // true: write post-depth-texture z; false: write the early z
	bool  bpmem_rgba6_format;   // true: ocol0.rgb is quantized to 6 bits per channel
	bool  bpmem_dither;         // true: apply the 2x2 dither pattern in main
	bool  bpmem_bounding_box;   // not read in this file
	uint4 bpmem_pack1[16];      // per stage: .xy = combiners, .z = tevind, .w = iref (see macros below)
	uint4 bpmem_pack2[8];       // .x = tevorder, .y = tevksel (see macros below)
	int4  konstLookup[32];      // konst color table, indexed by 5-bit selectors in getKonstColor
};

// Accessors for the packed register arrays in PSBlock.
// bpmem_pack1[i]: .xy = color/alpha combiner words, .z = tevind, .w = iref.
#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)
#define bpmem_tevind(i) (bpmem_pack1[(i)].z)
#define bpmem_iref(i) (bpmem_pack1[(i)].w)
// bpmem_pack2[i]: .x = tevorder, .y = tevksel.
// In this file both are indexed with (stage >> 1), i.e. one word covers two stages.
#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)
#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)

// Vertex shader output layout.
// NOTE(review): this struct is not referenced anywhere else in the visible source;
// main() declares its inputs as individual parameters with largely matching semantics.
struct VS_OUTPUT {
	 float4 pos : POSITION;
	 float4 colors_0 : COLOR0;
	 float4 colors_1 : COLOR1;
	 float4 clipPos : TEXCOORD0;
	 float clipDist0 : SV_ClipDistance0;
	 float clipDist1 : SV_ClipDistance1;
};
// Returns the `size`-bit wide field of `val` that starts at bit `off`.
uint bitfieldExtract(uint val, int off, int size) {
	// This built-in function is only supported in OpenGL 4.0+ and ES 3.1+
	// Microsoft's HLSL compiler automatically optimises this to a bitfield extract instruction.
	uint shifted = uint(val >> off);
	uint field_mask = uint((1 << size) - 1);
	return shifted & field_mask;
}

// Samples layer 0 of texture unit `sampler_num` at `uv` and returns the texel
// scaled from 0.0..1.0 to integers 0..255 (rounded to nearest).
// Out-of-range sampler numbers (> 7) return zero, so every control path
// yields a defined value.
int4 sampleTexture(uint sampler_num, float2 uv) {
  // This is messy, but DirectX, OpenGL 3.3 and OpenGL ES 3.0 don't support dynamic indexing of the sampler array
  // With any luck the shader compiler will optimise this if the hardware supports dynamic indexing.
  switch(sampler_num) {
  case 0u: return iround(Tex[0].Sample(samp[0], float3(uv, 0.0)) * 255.0);
  case 1u: return iround(Tex[1].Sample(samp[1], float3(uv, 0.0)) * 255.0);
  case 2u: return iround(Tex[2].Sample(samp[2], float3(uv, 0.0)) * 255.0);
  case 3u: return iround(Tex[3].Sample(samp[3], float3(uv, 0.0)) * 255.0);
  case 4u: return iround(Tex[4].Sample(samp[4], float3(uv, 0.0)) * 255.0);
  case 5u: return iround(Tex[5].Sample(samp[5], float3(uv, 0.0)) * 255.0);
  case 6u: return iround(Tex[6].Sample(samp[6], float3(uv, 0.0)) * 255.0);
  case 7u: return iround(Tex[7].Sample(samp[7], float3(uv, 0.0)) * 255.0);
  default: return int4(0, 0, 0, 0);
  }
}

// Color channel swapping: builds a new color whose r/g/b/a components are
// each picked from `color` by a 2-bit selector taken from swap table `s`.
int4 Swizzle(uint s, int4 color) {
  // One swap table entry spans two consecutive tevksel words:
  // the first holds the red (bits 0-1) and green (bits 2-3) selectors,
  // the second holds the blue (bits 0-1) and alpha (bits 2-3) selectors.
  uint rg_word = bpmem_tevksel(s * 2u);
  uint ba_word = bpmem_tevksel(s * 2u + 1u);

  return int4(color[bitfieldExtract(rg_word, 0, 2)],
              color[bitfieldExtract(rg_word, 2, 2)],
              color[bitfieldExtract(ba_word, 0, 2)],
              color[bitfieldExtract(ba_word, 2, 2)]);
}

// Applies a wrap mode to a coordinate:
//   mode 0 (ITW_OFF)             -> coordinate unchanged
//   mode 1..5 (ITW_256 - ITW_16) -> coordinate masked with (0xfffe >> mode)
//   mode 6 and above (ITW_0)     -> always 0
int Wrap(int coord, uint mode) {
  if (mode >= 6u)
    return 0;
  if (mode == 0u)
    return coord;
  return coord & (0xfffe >> mode);
}

// TEV's linear interpolate (scalar version), plus bias, add/subtract and scale.
//   A, B, C : interpolation inputs; C is the blend factor in 0..255
//   D       : value the interpolated term is added to or subtracted from
//   bias    : 1 adds 128 to D, 2 subtracts 128 from D, anything else leaves D alone
//   op      : false = D + lerp, true = D - lerp
//   alpha   : true when called for the alpha channel (affects when rounding is applied)
//   shift   : 0..2 = left shift applied to both terms, 3 = divide the result by 2
int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) {
  bool halve = (shift == 3u);

  // Scale the blend factor from 0..255 to 0..256
  int factor = C + (C >> 7);

  // Apply the bias to D
  int bias_offset = (bias == 1u) ? 128 : ((bias == 2u) ? -128 : 0);
  int addend = D + bias_offset;

  // 8.8 fixed point interpolation between A and B
  int blended = (A << 8) + (B - A) * factor;
  if (!halve) {
    blended = blended << shift;
    addend = addend << shift;
  }

  // Rounding term, applied only when (shift == 3) matches the alpha flag
  if (halve == alpha)
    blended = blended + (op ? 127 : 128);

  int term = blended >> 8;

  // Add or subtract the interpolated term
  int result = op ? (addend - term) : (addend + term);

  // Most of the shift was moved inside the lerp for improved precision,
  // but the divide by 2 still happens here
  return halve ? (result >> 1) : result;
}

// TEV's linear interpolate (three-channel version), plus bias, add/subtract and scale.
// Performs the same computation as the scalar tevLerp independently on each channel.
//   A, B, C : interpolation inputs; C is the blend factor in 0..255
//   D       : value the interpolated term is added to or subtracted from
//   bias    : 1 adds 128 to D, 2 subtracts 128 from D, anything else leaves D alone
//   op      : false = D + lerp, true = D - lerp
//   alpha   : true when called for the alpha channel (affects when rounding is applied)
//   shift   : 0..2 = left shift applied to both terms, 3 = divide the result by 2
int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) {
  bool halve = (shift == 3u);

  // Scale the blend factor from 0..255 to 0..256
  int3 factor = C + (C >> 7);

  // Apply the bias to D
  int bias_offset = (bias == 1u) ? 128 : ((bias == 2u) ? -128 : 0);
  int3 addend = D + bias_offset;

  // 8.8 fixed point interpolation between A and B
  int3 blended = (A << 8) + (B - A) * factor;
  if (!halve) {
    blended = blended << shift;
    addend = addend << shift;
  }

  // Rounding term, applied only when (shift == 3) matches the alpha flag
  if (halve == alpha)
    blended = blended + (op ? 127 : 128);

  int3 term = blended >> 8;

  // Add or subtract the interpolated term
  int3 result;
  if (op)
    result = addend - term;
  else
    result = addend + term;

  // Most of the shift was moved inside the lerp for improved precision,
  // but the divide by 2 still happens here
  if (halve)
    result = result >> 1;
  return result;
}

// Implements operations 0-5 of TEV's compare mode, which are shared by the
// color and alpha channels. Any other op evaluates to false.
bool tevCompare(uint op, int3 color_A, int3 color_B) {
  if (op == 0u) // TEVCMP_R8_GT
    return color_A.r > color_B.r;

  if (op == 1u) // TEVCMP_R8_EQ
    return color_A.r == color_B.r;

  if (op == 2u) { // TEVCMP_GR16_GT: green is the high byte, red the low byte
    int a_gr = color_A.r | (color_A.g << 8);
    int b_gr = color_B.r | (color_B.g << 8);
    return a_gr > b_gr;
  }

  if (op == 3u) // TEVCMP_GR16_EQ
    return all(color_A.rg == color_B.rg);

  if (op == 4u) { // TEVCMP_BGR24_GT: blue is the high byte, red the low byte
    int a_bgr = color_A.r | (color_A.g << 8) | (color_A.b << 16);
    int b_bgr = color_B.r | (color_B.g << 8) | (color_B.b << 16);
    return a_bgr > b_bgr;
  }

  if (op == 5u) // TEVCMP_BGR24_EQ
    return all(color_A.rgb == color_B.rgb);

  return false;
}

// Helper function for the alpha test: evaluates `a <compare> b`.
//   compare : 0 NEVER, 1 LESS, 2 EQUAL, 3 LEQUAL, 4 GREATER, 5 NEQUAL, 6 GEQUAL, 7 ALWAYS
// Values above 7 are treated like NEVER so that every control path returns
// a defined value (callers in this file only pass 3-bit fields).
bool alphaCompare(int a, int b, uint compare) {
  switch (compare) {
  case 0u: // NEVER
    return false;
  case 1u: // LESS
    return a < b;
  case 2u: // EQUAL
    return a == b;
  case 3u: // LEQUAL
    return a <= b;
  case 4u: // GREATER
    return a > b;
  case 5u: // NEQUAL
    return a != b;
  case 6u: // GEQUAL
    return a >= b;
  case 7u: // ALWAYS
    return true;
  default: // Out of range: behave like NEVER
    return false;
  }
}

// Mutable TEV state carried across stages.
struct State {
  int4 Reg[4];    // TEV registers: [0] = prev, [1] = c0, [2] = c1, [3] = c2
  int4 TexColor;  // texture color input; main() initialises it to zero and this file never writes it afterwards
  int AlphaBump;  // alpha bump value returned by getRasColor for ras selections 5 and 6
};
// Per-stage configuration decoded at the top of each TEV loop iteration.
struct StageState {
  uint stage;  // index of the current TEV stage
  uint order;  // tevorder word for this stage (already shifted right by 12 for odd stages)
  uint cc;     // color combiner word (bpmem_combiners(stage).x)
  uint ac;     // alpha combiner word (bpmem_combiners(stage).y)
};

// Forward declarations: both functions are defined after main() but are
// called from the input selectors below.
int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1);
int4 getKonstColor(State s, StageState ss);

// Resolves a 4-bit color combiner input selector to an RGB value.
//   s, ss              : current TEV state and per-stage configuration
//   colors_0, colors_1 : interpolated vertex colors, forwarded to getRasColor
//   index              : selector, 0..15
// Selectors above 15 return zero so that every control path returns a
// defined value (callers in this file only pass 4-bit fields).
int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
  switch (index) {
  case 0u: // prev.rgb
    return s.Reg[0].rgb;
  case 1u: // prev.aaa
    return s.Reg[0].aaa;
  case 2u: // c0.rgb
    return s.Reg[1].rgb;
  case 3u: // c0.aaa
    return s.Reg[1].aaa;
  case 4u: // c1.rgb
    return s.Reg[2].rgb;
  case 5u: // c1.aaa
    return s.Reg[2].aaa;
  case 6u: // c2.rgb
    return s.Reg[3].rgb;
  case 7u: // c2.aaa
    return s.Reg[3].aaa;
  case 8u: // tex.rgb
    return s.TexColor.rgb;
  case 9u: // tex.aaa
    return s.TexColor.aaa;
  case 10u: // ras.rgb
    return getRasColor(s, ss, colors_0, colors_1).rgb;
  case 11u: // ras.aaa
    return getRasColor(s, ss, colors_0, colors_1).aaa;
  case 12u: // One
    return int3(255, 255, 255);
  case 13u: // Half
    return int3(128, 128, 128);
  case 14u: // konst.rgb
    return getKonstColor(s, ss).rgb;
  case 15u: // Zero
    return int3(0, 0, 0);
  default: // Out of range: behave like Zero
    return int3(0, 0, 0);
  }
}

// Resolves a 3-bit alpha combiner input selector to an alpha value.
//   s, ss              : current TEV state and per-stage configuration
//   colors_0, colors_1 : interpolated vertex colors, forwarded to getRasColor
//   index              : selector, 0..7
// Selectors above 7 return zero so that every control path returns a
// defined value (callers in this file only pass 3-bit fields).
int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
  switch (index) {
  case 0u: // prev.a
    return s.Reg[0].a;
  case 1u: // c0.a
    return s.Reg[1].a;
  case 2u: // c1.a
    return s.Reg[2].a;
  case 3u: // c2.a
    return s.Reg[3].a;
  case 4u: // tex.a
    return s.TexColor.a;
  case 5u: // ras.a
    return getRasColor(s, ss, colors_0, colors_1).a;
  case 6u: // konst.a
    return getKonstColor(s, ss).a;
  case 7u: // Zero
    return 0;
  default: // Out of range: behave like Zero
    return 0;
  }
}

// Reads one of the four TEV registers (0 = prev, 1 = c0, 2 = c1, 3 = c2).
// Any other index falls back to prev.
int4 getTevReg(in State s, uint index) {
  if (index == 1u) // c0
    return s.Reg[1];
  if (index == 2u) // c1
    return s.Reg[2];
  if (index == 3u) // c2
    return s.Reg[3];
  // prev (also the fallback for out-of-range indices)
  return s.Reg[0];
}

// Writes the RGB part of one TEV register (0 = prev, 1 = c0, 2 = c1, 3 = c2).
// The register's alpha is left untouched; other indices are ignored.
void setRegColor(inout State s, uint index, int3 color) {
  if (index == 0u) {        // prev
    s.Reg[0].rgb = color;
  } else if (index == 1u) { // c0
    s.Reg[1].rgb = color;
  } else if (index == 2u) { // c1
    s.Reg[2].rgb = color;
  } else if (index == 3u) { // c2
    s.Reg[3].rgb = color;
  }
}

// Writes the alpha part of one TEV register (0 = prev, 1 = c0, 2 = c1, 3 = c2).
// The register's RGB is left untouched; other indices are ignored.
void setRegAlpha(inout State s, uint index, int alpha) {
  if (index == 0u) {        // prev
    s.Reg[0].a = alpha;
  } else if (index == 1u) { // c0
    s.Reg[1].a = alpha;
  } else if (index == 2u) { // c1
    s.Reg[2].a = alpha;
  } else if (index == 3u) { // c2
    s.Reg[3].a = alpha;
  }
}

// Pixel shader entry point.
// Runs the TEV stages configured in PSBlock, then applies (in this order)
// depth computation / ZFreeze / depth texture, the alpha test, dithering and
// fog, and finally writes:
//   ocol0 : final color (alpha optionally overridden by bpmem_dstalpha)
//   ocol1 : (0, 0, 0, TEV alpha / 255) for dual source blending
//   depth : per-pixel depth
// clipPos, clipDist0 and clipDist1 are declared as inputs but are not read in the body.
void main(
  out float4 ocol0 : SV_Target0,
  out float4 ocol1 : SV_Target1,
  
  out float depth : SV_Depth,
  in float4 rawpos : SV_Position,
  in  float4 colors_0 : COLOR0,
  in  float4 colors_1 : COLOR1
,
  in  float4 clipPos : TEXCOORD0,
  in float clipDist0 : SV_ClipDistance0
,
  in float clipDist1 : SV_ClipDistance1

        ) {
  // NOTE(review): tevcoord is never read in this 0-texgen variant.
  int3 tevcoord = int3(0, 0, 0);
  State s;
  s.TexColor = int4(0, 0, 0, 0);
  s.AlphaBump = 0;

  // Load the initial TEV register values from the constant buffer
  s.Reg[0] = color[0];
  s.Reg[1] = color[1];
  s.Reg[2] = color[2];
  s.Reg[3] = color[3];
  // Index of the last active stage; the loop below runs num_stages + 1 iterations
  uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4);

  // Main tev loop
  [loop]
  for(uint stage = 0u; stage <= num_stages; stage++)
  {
    StageState ss;
    ss.stage = stage;
    ss.cc = bpmem_combiners(stage).x;
    ss.ac = bpmem_combiners(stage).y;
    // Two stages share one tevorder word; odd stages use the bits starting at bit 12
    ss.order = bpmem_tevorder(stage>>1);
    if ((stage & 1u) == 1u)
      ss.order = ss.order >> 12;

    // This is the Meat of TEV
    {
      // Color Combiner
      uint color_a = bitfieldExtract(ss.cc, 12, 4);
      uint color_b = bitfieldExtract(ss.cc, 8, 4);
      uint color_c = bitfieldExtract(ss.cc, 4, 4);
      uint color_d = bitfieldExtract(ss.cc, 0, 4);
      uint color_bias = bitfieldExtract(ss.cc, 16, 2);
      bool color_op = bool(bitfieldExtract(ss.cc, 18, 1));
      bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1));
      uint color_shift = bitfieldExtract(ss.cc, 20, 2);
      uint color_dest = bitfieldExtract(ss.cc, 22, 2);
      // In compare mode (bias == 3) the shift and op bits together select the compare op
      uint color_compare_op = color_shift << 1 | uint(color_op);

      // A, B and C are truncated to 8 bits; D keeps its full range
      int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255);
      int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255);
      int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255);
      int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d);  // 10 bits + sign

      int3 color;
      if(color_bias != 3u) { // Normal mode
        color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift);
      } else { // Compare mode
        // op 6 and 7 do a select per color channel
        if (color_compare_op == 6u) {
          // TEVCMP_RGB8_GT
          color.r = (color_A.r > color_B.r) ? color_C.r : 0;
          color.g = (color_A.g > color_B.g) ? color_C.g : 0;
          color.b = (color_A.b > color_B.b) ? color_C.b : 0;
        } else if (color_compare_op == 7u) {
          // TEVCMP_RGB8_EQ
          color.r = (color_A.r == color_B.r) ? color_C.r : 0;
          color.g = (color_A.g == color_B.g) ? color_C.g : 0;
          color.b = (color_A.b == color_B.b) ? color_C.b : 0;
        } else {
          // The remaining ops do one compare which selects all 3 channels
          color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0);
        }
        color = color_D + color;
      }

      // Clamp result
      if (color_clamp)
        color = clamp(color, 0, 255);
      else
        color = clamp(color, -1024, 1023);

      // Write result to the correct input register of the next stage
      setRegColor(s, color_dest, color);

      // Alpha Combiner
      uint alpha_a = bitfieldExtract(ss.ac, 13, 3);
      uint alpha_b = bitfieldExtract(ss.ac, 10, 3);
      uint alpha_c = bitfieldExtract(ss.ac, 7, 3);
      uint alpha_d = bitfieldExtract(ss.ac, 4, 3);
      uint alpha_bias = bitfieldExtract(ss.ac, 16, 2);
      bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1));
      bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1));
      uint alpha_shift = bitfieldExtract(ss.ac, 20, 2);
      uint alpha_dest = bitfieldExtract(ss.ac, 22, 2);
      uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op);

      // alpha_A / alpha_B are intentionally left unassigned for compare ops 0-5;
      // every later read of them sits behind the same condition.
      int alpha_A;
      int alpha_B;
      if (alpha_bias != 3u || alpha_compare_op > 5u) {
        // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5
        alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255;
        alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255;
      };
      int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255;
      int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign


      int alpha;
      if(alpha_bias != 3u) { // Normal mode
        alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift);
      } else { // Compare mode
        if (alpha_compare_op == 6u) {
          // TEVCMP_A8_GT
          alpha = (alpha_A > alpha_B) ? alpha_C : 0;
        } else if (alpha_compare_op == 7u) {
          // TEVCMP_A8_EQ
          alpha = (alpha_A == alpha_B) ? alpha_C : 0;
        } else {
          // All remaining alpha compare ops actually compare the color channels
          alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0;
        }
        alpha = alpha_D + alpha;
      }

      // Clamp result
      if (alpha_clamp)
        alpha = clamp(alpha, 0, 255);
      else
        alpha = clamp(alpha, -1024, 1023);

      // Write result to the correct input register of the next stage
      setRegAlpha(s, alpha_dest, alpha);
    }
  } // Main tev loop

  // The final result is read from the destination registers of the last stage
  // (color and alpha destinations may differ), truncated to 8 bits per channel.
  int4 TevResult;
  TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz;
  TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w;
  TevResult &= 255;

  // Convert the rasterizer depth to a 24-bit integer (depth is stored inverted: 1.0 - z)
  int zCoord = int((1.0 - rawpos.z) * 16777216.0);
  zCoord = clamp(zCoord, 0, 0xFFFFFF);

  // ZFreeze (bit 19 of genmode): depth comes from a screen-space plane instead
  if ((bpmem_genmode & 524288u) != 0u) {
    float2 screenpos = rawpos.xy * cefbscale.xy;
    zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y);
 }

  // Depth Texture
  int early_zCoord = zCoord;
  if (bpmem_ztex_op != 0u) {
    int ztex = int(czbias[1].w); // fixed bias

    // Whatever texture was in our last stage, it's now our depth texture
    ztex += idot(s.TexColor.xyzw, czbias[0].xyzw);
    // Op 1 adds the rasterized depth; any other non-zero op replaces it
    ztex += (bpmem_ztex_op == 1u) ? zCoord : 0;
    zCoord = ztex & 0xFFFFFF;
  }

  // If early depth is enabled, write to zbuffer before depth textures
  // If early depth isn't enabled, we write to the zbuffer here
  int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord;
  depth = 1.0 - float(zbuffer_zCoord) / 16777216.0;
  // Alpha Test
  if (bpmem_alphaTest != 0u) {
    bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3));
    bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3));

    // These if statements are written weirdly to work around Intel and Qualcomm bugs with handling booleans.
    switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) {
    case 0u: // AND
      if (comp0 && comp1) break; else discard; break;
    case 1u: // OR
      if (comp0 || comp1) break; else discard; break;
    case 2u: // XOR
      if (comp0 != comp1) break; else discard; break;
    case 3u: // XNOR
      if (comp0 == comp1) break; else discard; break;
    }
  }

  if (bpmem_dither) {
    // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
    // Here the matrix is encoded into the two factor constants
    int2 dither = int2(rawpos.xy) & 1;
    TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);
  }

  // Fog
  uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3);
  if (fog_function != 0u) {
    // TODO: This all needs to be converted from float to fixed point
    float ze;
    if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) {
      // perspective
      // ze = A/(B - (Zs >> B_SHF))
      ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w));
    } else {
      // orthographic
      // ze = a*Zs    (here, no B_SHF)
      ze = cfogf[1].x * float(zCoord) / 16777216.0;
    }

    if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) {
      // x_adjust = sqrt((x-center)^2 + k^2)/k
      // ze *= x_adjust
      // TODO Instead of this theoretical calculation, we should use the
      //      coefficient table given in the fog range BP registers!
      float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; 
      x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z;
      ze *= x_adjust;
    }

    float fog = clamp(ze - cfogf[1].z, 0.0, 1.0);

    // Functions 1-3 keep the linear value; 4-7 reshape it with exponential curves
    if (fog_function > 3u) {
      switch (fog_function) {
      case 4u:
        fog = 1.0 - exp2(-8.0 * fog);
        break;
      case 5u:
        fog = 1.0 - exp2(-8.0 * fog * fog);
        break;
      case 6u:
        fog = exp2(-8.0 * (1.0 - fog));
        break;
      case 7u:
        fog = 1.0 - fog;
        fog = exp2(-8.0 * fog * fog);
        break;
      }
    }

    // Blend towards the fog color with an integer weight of 0..256
    int ifog = iround(fog * 256.0);
    TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8;
  }

  if (bpmem_rgba6_format)
    ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;
  else
    ocol0.rgb = float3(TevResult.rgb) / 255.0;

  // Alpha written to ocol0 is always quantized to 6 bits, regardless of bpmem_rgba6_format
  if (bpmem_dstalpha != 0u)
    ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0;
  else
    ocol0.a = float(TevResult.a >> 2) / 63.0;
  
  // Dest alpha override (dual source blending)
  // Colors will be blended against the alpha from ocol1 and
  // the alpha from ocol0 will be written to the framebuffer.
  ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0);
}

// Returns the rasterized color input for the current stage, chosen by bits
// 7-9 of the stage's tevorder word:
//   0, 1  : vertex color channel 0 / 1, converted to 0..255 and swizzled
//   5     : alpha bump, replicated to all four components
//   6     : normalized alpha bump, replicated to all four components
//   other : zero
int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) {
  // Select Ras for stage
  uint ras = bitfieldExtract(ss.order, 7, 3);

  if (ras == 5u) { // Alpha Bump
    int bump = s.AlphaBump;
    return int4(bump, bump, bump, bump);
  }

  if (ras == 6u) { // Normalized Alpha Bump
    int normalized = s.AlphaBump | (s.AlphaBump >> 5);
    return int4(normalized, normalized, normalized, normalized);
  }

  if (ras >= 2u)
    return int4(0, 0, 0, 0);

  // Lighting Channel 0 or 1; the swap table index is the low 2 bits of the alpha combiner word
  float4 lit = (ras == 0u) ? colors_0 : colors_1;
  uint swap = bitfieldExtract(ss.ac, 0, 2);
  return Swizzle(swap, iround(lit * 255.0));
}

// Returns the konst color for the current stage. RGB and alpha are looked up
// independently in konstLookup, each through its own 5-bit selector.
int4 getKonstColor(State s, StageState ss) {
  // Select Konst for stage
  // TODO: a switch case might be better here than a dynamically indexed uniform lookup
  uint tevksel = bpmem_tevksel(ss.stage>>1);

  // Two stages share one tevksel word: even stages use the selectors at
  // bits 4 (rgb) and 9 (alpha), odd stages those at bits 14 (rgb) and 19 (alpha).
  int rgb_off = ((ss.stage & 1u) == 0u) ? 4 : 14;
  int a_off = rgb_off + 5;

  uint rgb_sel = bitfieldExtract(tevksel, rgb_off, 5);
  uint a_sel = bitfieldExtract(tevksel, a_off, 5);
  return int4(konstLookup[rgb_sel].rgb, konstLookup[a_sel].a);
}