/**
 * Intra decoding involves a lot of shuffling, for which I use compatible
 * instructions from both SSE and NEON, with a few exceptions where a common
 * ground would be too suboptimal.
 *
 * Each decoding function is a large switch with a downward tree structure for
 * sharing code. Cases start at the leaves and branch towards the exit root.
 * Since there is no way to make compilers generate proper code with vanilla
 * switches or functions, goto instructions are used for internal branches.
 *
 * Choosing between the different possibilities for the same function is
 * tricky; in general I favor, in order:
 * _ the fastest code, obviously (http://www.agner.org/optimize/#manual_instr_tab),
 * _ a short dependency chain (more freedom for compilers to reorder),
 * _ smaller code+data (avoid excessive use of pshufb),
 * _ readable code.
 *
 * My rules of thumb:
 * _ Aligned reads are favored if they incur no additional instructions.
 * _ pshufb is used iff doing otherwise would require 3+ instructions.
 * _ Favor vector over scalar code to avoid callee-save conventions.
 */

#include "edge264_internal.h"

#if defined(__SSE2__)
#define ldleft3(v0, y1, y2, y3) shr128(ziplo16(ziplo8((u32x4)(v0) << 24, load32(P(-4, y1))), ziplo8(load32(P(-4, y2)), load32(P(-4, y3)))), 12)
#define ldleft4(y0, y1, y2, y3) shr128(ziplo16(ziplo8(load32(P(-4, y0)), load32(P(-4, y1))), ziplo8(load32(P(-4, y2)), load32(P(-4, y3)))), 12)
#define ldedge4x4() shrd128(ziplo16(ziplo8(load32(P(-4, 3)), load32(P(-4, 2))), ziplo8(load32(P(-4, 1)), load32(P(-4, 0)))), load64(P(-1, -1)), 12)
#define ldleft7(v0, y1, y2, y3, y4, y5, y6, y7) shr128(ziphi32(ziplo16(ziplo8((u32x4)(v0) << 24, load32(P(-4, y1))), ziplo8(load32(P(-4, y2)), load32(P(-4, y3)))), ziplo16(ziplo8(load32(P(-4, y4)), load32(P(-4, y5))), ziplo8(load32(P(-4, y6)), load32(P(-4, y7))))), 8)
#define ldleft8(y0, y1, y2, y3, y4, y5, y6, y7) shr128(ziphi32(ziplo16(ziplo8(load32(P(-4, y0)), load32(P(-4, y1))), ziplo8(load32(P(-4, y2)), load32(P(-4, y3)))), ziplo16(ziplo8(load32(P(-4, y4)), load32(P(-4, y5))), ziplo8(load32(P(-4, y6)), load32(P(-4, y7))))), 8)
#define ldedge8x8lo() ({i8x16 _v7 = load64(P(-8, 7)); shuffleps(_v7, ziphi32(ziphi16(ziplo8(_v7, load64(P(-8, 6))), ziplo8(load64(P(-8, 5)), load64(P(-8, 4)))), ziplo16(ziplo8(load32(P(-4, 3)), load32(P(-4, 2))), ziplo8(load32(P(-4, 1)), load32(P(-4, 0))))), 0, 1, 2, 3);})
#define ldleft16(y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, yA, yB, yC, yD, yE, yF) ziphi64(ziphi32(ziplo16(ziplo8(load32(P(-4, y0)), load32(P(-4, y1))), ziplo8(load32(P(-4, y2)), load32(P(-4, y3)))), ziplo16(ziplo8(load32(P(-4, y4)), load32(P(-4, y5))), ziplo8(load32(P(-4, y6)), load32(P(-4, y7))))), ziphi32(ziplo16(ziplo8(load32(P(-4, y8)), load32(P(-4, y9))), ziplo8(load32(P(-4, yA)), load32(P(-4, yB)))), ziplo16(ziplo8(load32(P(-4, yC)), load32(P(-4, yD))), ziplo8(load32(P(-4, yE)), load32(P(-4, yF))))))
#define spreadh8(a) shuffle(a, (i8x16){0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7})
#define spreadq8(a) shuffle(a, (i8x16){0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3})
static always_inline i8x16 lowpass8(i8x16 l, i8x16 m, i8x16 r) {return avg8(usubs8(avg8(l, r), (l ^ r) & set8(1)), m);}
#elif defined(__ARM_NEON)
#define addlou8(a, b) (i16x8)vaddl_u8(vget_low_s8(a), vget_low_s8(b))
#define lowpass8(l, m, r) (i8x16)vrhaddq_u8(vhaddq_u8(l, r), m)
#define ldleft3(v0, y1, y2, y3) ({i8x16 _v = v0; _v[1] = *P(-1, y1), _v[2] = *P(-1, y2), _v[3] = *P(-1, y3); _v;})
#define ldleft4(y0, y1, y2, y3) (i8x16){*P(-1, y0), *P(-1, y1), *P(-1, y2), *P(-1, y3)}
#define ldedge4x4() ({i8x16 _v = load128(P(-5, -1)); _v[3] = *P(-1, 0); _v[2] = *P(-1, 1); _v[1] = *P(-1, 2); _v[0] = *P(-1, 3); _v;})
#define ldleft7(v0, y1, y2, y3, y4, y5, y6, y7) ({i8x16 _v = v0; _v[1] = *P(-1, y1), _v[2] = *P(-1, y2), _v[3] = *P(-1, y3), _v[4] = *P(-1, y4), _v[5] = *P(-1, y5), _v[6] = *P(-1, y6), _v[7] = *P(-1, y7); _v;})
#define ldleft8(y0, y1, y2, y3, y4, y5, y6, y7) (i8x16){*P(-1, y0), *P(-1, y1), *P(-1, y2), *P(-1, y3), *P(-1, y4), *P(-1, y5), *P(-1, y6), *P(-1, y7)}
#define ldedge8x8lo() ({i8x16 _v = set8(*P(-1, 7)); _v[15] = *P(-1, 0); _v[14] = *P(-1, 1); _v[13] = *P(-1, 2); _v[12] = *P(-1, 3); _v[11] = *P(-1, 4); _v[10] = *P(-1, 5); _v[9] = *P(-1, 6); _v;})
#define ldleft16(y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, yA, yB, yC, yD, yE, yF) (i8x16){*P(-1, y0), *P(-1, y1), *P(-1, y2), *P(-1, y3), *P(-1, y4), *P(-1, y5), *P(-1, y6), *P(-1, y7), *P(-1, y8), *P(-1, y9), *P(-1, yA), *P(-1, yB), *P(-1, yC), *P(-1, yD), *P(-1, yE), *P(-1, yF)}
#define sublou8(a, b) (i16x8)vsubl_u8(vget_low_s8(a), vget_low_s8(b))
static always_inline i8x16 spreadh8(i8x16 a) {return vzip1q_s64(a, vdupq_laneq_s8(a, 7));}
static always_inline i8x16 spreadq8(i8x16 a) {return vextq_s8(vextq_s8(a, a, 4), vdupq_laneq_s8(a, 3), 12);}
#endif
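// A minimal scalar sketch (illustrative helper, not referenced by the
// decoder) of what lowpass8 computes per byte lane: the 3-tap filter
// (l + 2*m + r + 2) >> 2, evaluated without widening to 16 bits. avg8
// rounds up, so the SSE2 version cancels the double rounding by subtracting
// the carry (l ^ r) & 1 before the second average; on NEON the truncating
// vhaddq_u8 followed by the rounding vrhaddq_u8 is equivalent.
static inline unsigned lowpass1_ref(unsigned l, unsigned m, unsigned r) {
	return (l + 2 * m + r + 2) >> 2;
}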
/**
 * Intra 4x4
 */
static void decode_intra4x4(int mode, uint8_t * restrict p, size_t stride, i16x8 clip) {
	INIT_P();
	i8x16 v, shuf;
	i32x4 pred;
	switch (mode) {
	default: __builtin_unreachable();
	
	case I4x4_V_8:
		pred = set32(*(int32_t *)P(0, -1));
		goto store_4x4;
	
	case I4x4_H_8:
		*(int32_t *)P(0, 0) = ((i32x4)set8(*P(-1, 0)))[0];
		*(int32_t *)P(0, 1) = ((i32x4)set8(*P(-1, 1)))[0];
		*(int32_t *)P(0, 2) = ((i32x4)set8(*P(-1, 2)))[0];
		*(int32_t *)P(0, 3) = ((i32x4)set8(*P(-1, 3)))[0];
		return;
	
	case I4x4_DC_8:
		v = ziplo32(ldleft4(0, 1, 2, 3), load32(P(0, -1)));
	dc_4x4:
		pred = broadcast8(shrru16(sumh8(v), 3), __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__);
		goto store_4x4;
	case I4x4_DCA_8: {
		i8x16 v0 = load32(P(0, -1));
		v = ziplo32(v0, v0);
		} goto dc_4x4;
	case I4x4_DCB_8: {
		i8x16 v0 = ldleft4(0, 1, 2, 3);
		v = ziplo32(v0, v0);
		} goto dc_4x4;
	case I4x4_DCAB_8:
		pred = set8(-128);
		goto store_4x4;
	
	case I4x4_DDL_8:
		v = spreadh8(load64(P(0, -1)));
		shuf = (i8x16){0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6};
		goto lowpass_4x4;
	case I4x4_DDLC_8:
		v = spreadq8(load32(P(0, -1)));
		shuf = (i8x16){0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6};
		goto lowpass_4x4;
	
	case I4x4_DDR_8:
		shuf = (i8x16){3, 4, 5, 6, 2, 3, 4, 5, 1, 2, 3, 4, 0, 1, 2, 3};
		goto down_right_4x4;
	case I4x4_VR_8:
		shuf = (i8x16){12, 13, 14, 15, 3, 4, 5, 6, 2, 12, 13, 14, 1, 3, 4, 5};
		goto down_right_4x4;
	case I4x4_HD_8:
		shuf = (i8x16){11, 3, 4, 5, 10, 2, 11, 3, 9, 1, 10, 2, 8, 0, 9, 1};
		goto down_right_4x4;
	
	case I4x4_VL_8:
		v = load64(P(0, -1));
		shuf = (i8x16){8, 9, 10, 11, 0, 1, 2, 3, 9, 10, 11, 12, 1, 2, 3, 4};
		goto lowpass_4x4;
	case I4x4_VLC_8:
		v = spreadq8(load32(P(0, -1)));
		shuf = (i8x16){8, 9, 10, 11, 0, 1, 2, 3, 9, 10, 11, 12, 1, 2, 3, 4};
		goto lowpass_4x4;
	
	case I4x4_HU_8:
		v = shlc128(ldleft4(3, 2, 1, 0), 1);
		shuf = (i8x16){11, 2, 10, 1, 10, 1, 9, 0, 9, 0, 8, 8, 8, 8, 8, 8};
		goto lowpass_4x4;
	
	down_right_4x4:
		v = ldedge4x4();
	lowpass_4x4: {
		i8x16 w = shr128(v, 1);
		i8x16 x = shr128(v, 2);
		pred = shuffle(lowpass8(ziplo64(v, v), ziplo64(w, w), ziplo64(x, v)), shuf);
	}
	store_4x4:
		*(int32_t *)P(0, 0) = pred[0];
		*(int32_t *)P(0, 1) = pred[1];
		*(int32_t *)P(0, 2) = pred[2];
		*(int32_t *)P(0, 3) = pred[3];
		return;
	}
}
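// A minimal scalar sketch (hypothetical helper, not part of the decoder) of
// the I4x4 DC rule implemented above: with both edges available the
// predictor is (sum(left[0..3]) + sum(top[0..3]) + 4) >> 3; with a single
// edge the code duplicates it with ziplo32 so the same rounded 8-sample sum
// applies.
static inline unsigned dc4x4_ref(const uint8_t left[4], const uint8_t top[4]) {
	unsigned sum = 4; // rounding term
	for (int i = 0; i < 4; i++)
		sum += left[i] + top[i];
	return sum >> 3;
}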
/**
 * Intra 8x8
 *
 * Neighbouring samples are named a to z from bottom left to top right. i is
 * p[-1,-1], or p[-1,0] if the former is unavailable; likewise j is p[-1,-1],
 * or p[0,-1] if unavailable.
 */
static void decode_intra8x8(int mode, uint8_t * restrict p, size_t stride, i16x8 clip) {
	INIT_P();
	i8x16 i2a, j2s, j2y, k2z, j2q, k2r, l2s;
	i64x2 pred, p0, p1, p2, p3, p4, p5, p6, p7;
	switch (mode) {
	default: __builtin_unreachable();
	
	case I8x8_V_8:
		j2q = load128(P(-1, -1));
		l2s = shr128(j2q, 2);
		k2r = shr128(j2q, 1);
	vertical_8x8_lowpass:
		pred = lowpass8(j2q, k2r, l2s);
	store1_8x8:
		*(int64_t *)P(0, 0) = pred[0];
		*(int64_t *)P(0, 1) = pred[0];
		*(int64_t *)P(0, 2) = pred[0];
		*(int64_t *)P(0, 3) = pred[0];
		*(int64_t *)P(0, 4) = pred[0];
		*(int64_t *)P(0, 5) = pred[0];
		*(int64_t *)P(0, 6) = pred[0];
		*(int64_t *)P(0, 7) = pred[0];
		return;
	case I8x8_V_C_8: {
		i8x16 v0 = load128(P(-8, -1));
		l2s = shrc128(v0, 9);
		j2q = shr128(v0, 7);
		k2r = shr128(v0, 8);
		} goto vertical_8x8_lowpass;
	case I8x8_V_D_8:
		k2r = load128(P(0, -1));
		j2q = shlc128(k2r, 1);
		l2s = shr128(k2r, 1);
		goto vertical_8x8_lowpass;
	case I8x8_V_CD_8:
		k2r = spreadh8(load64(P(0, -1)));
		j2q = shlc128(k2r, 1);
		l2s = shr128(k2r, 1);
		goto vertical_8x8_lowpass;
	
	case I8x8_H_8:
		i2a = (i8x16){*P(-1, -1)};
	horizontal_8x8_load_left: {
		i2a = ziplo64(ldleft7(i2a, 0, 1, 2, 3, 4, 5, 6), set8(*P(-1, 7)));
		i8x16 v0 = lowpass8(shr128(i2a, 2), shr128(i2a, 1), i2a);
		p0 = broadcast8(v0, 0);
		p1 = broadcast8(v0, 1);
		p2 = broadcast8(v0, 2);
		p3 = broadcast8(v0, 3);
		p4 = broadcast8(v0, 4);
		p5 = broadcast8(v0, 5);
		p6 = broadcast8(v0, 6);
		p7 = broadcast8(v0, 7);
	}
	store8_8x8:
		*(int64_t *)P(0, 0) = p0[0];
		*(int64_t *)P(0, 1) = p1[0];
		*(int64_t *)P(0, 2) = p2[0];
		*(int64_t *)P(0, 3) = p3[0];
		*(int64_t *)P(0, 4) = p4[0];
		*(int64_t *)P(0, 5) = p5[0];
		*(int64_t *)P(0, 6) = p6[0];
		*(int64_t *)P(0, 7) = p7[0];
		return;
	case I8x8_H_D_8:
		i2a = (i8x16){*P(-1, 0)};
		goto horizontal_8x8_load_left;
	
	case I8x8_DC_8:
		i2a = j2s = load128(P(-1, -1));
	dc_8x8_load_left:
		i2a = ziplo64(ldleft7(i2a, 0, 1, 2, 3, 4, 5, 6), set8(*P(-1, 7)));
	dc_8x8_sum: {
		i8x16 v0 = ziplo64(j2s, i2a);
		i8x16 v1 = ziplo64(shr128(j2s, 1), shr128(i2a, 1));
		i8x16 v2 = ziplo64(shr128(j2s, 2), shr128(i2a, 2));
		pred = broadcast8(shrru16(sum8(lowpass8(v0, v1, v2)), 4), __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__);
		} goto store1_8x8;
	case I8x8_DC_C_8:
		i2a = j2s = shrc128(load128(P(-8, -1)), 7);
		goto dc_8x8_load_left;
	case I8x8_DC_D_8:
		j2s = shlc128(load128(P(0, -1)), 1);
		i2a = (i8x16){*P(-1, 0)};
		goto dc_8x8_load_left;
	case I8x8_DC_CD_8:
		j2s = shlc128(spreadh8(load64(P(0, -1))), 1);
		i2a = (i8x16){*P(-1, 0)};
		goto dc_8x8_load_left;
	case I8x8_DC_A_8:
		i2a = j2s = load128(P(-1, -1));
		goto dc_8x8_sum;
	case I8x8_DC_AC_8:
		i2a = j2s = shrc128(load128(P(-8, -1)), 7);
		goto dc_8x8_sum;
	case I8x8_DC_AD_8:
		i2a = j2s = shlc128(load128(P(0, -1)), 1);
		goto dc_8x8_sum;
	case I8x8_DC_ACD_8:
		i2a = j2s = shlc128(spreadh8(load64(P(0, -1))), 1);
		goto dc_8x8_sum;
	case I8x8_DC_B_8:
		i2a = (i8x16){*P(-1, -1)};
	dc_8x8_dup_left:
		j2s = i2a = ziplo64(ldleft7(i2a, 0, 1, 2, 3, 4, 5, 6), set8(*P(-1, 7)));
		goto dc_8x8_sum;
	case I8x8_DC_BD_8:
		i2a = (i8x16){*P(-1, 0)};
		goto dc_8x8_dup_left;
	case I8x8_DC_AB_8:
		pred = set8(-128);
		goto store1_8x8;
	
	case I8x8_DDL_8:
		j2y = load128(P(-1, -1));
		k2z = load128(P(0, -1));
	diagonal_down_left_8x8_lowpass: {
		i8x16 v0 = lowpass8(j2y, k2z, shrc128(k2z, 1));
		p0 = lowpass8(v0, shr128(v0, 1), shrc128(v0, 2));
		p1 = shr128(p0, 1);
		p2 = shr128(p0, 2);
		p3 = shr128(p0, 3);
		p4 = shr128(p0, 4);
		p5 = shr128(p0, 5);
		p6 = shr128(p0, 6);
		p7 = shr128(p0, 7);
		} goto store8_8x8;
	case I8x8_DDL_C_8: {
		i8x16 j2r = load128(P(-8, -1));
		j2y = shrc128(j2r, 7);
		k2z = shrc128(j2r, 8);
		} goto diagonal_down_left_8x8_lowpass;
	case I8x8_DDL_D_8:
		k2z = load128(P(0, -1));
		j2y = shlc128(k2z, 1);
		goto diagonal_down_left_8x8_lowpass;
	case I8x8_DDL_CD_8:
		k2z = spreadh8(load64(P(0, -1)));
		j2y = shlc128(k2z, 1);
		goto diagonal_down_left_8x8_lowpass;
	
	case I8x8_DDR_8:
		j2s = load128(P(-1, -1));
	diagonal_down_right_8x8_load_left: {
		i8x16 a2h = ldedge8x8lo();
		i8x16 a2q = shrd128(a2h, j2s, 8);
		i8x16 b2r = shrd128(a2h, j2s, 9);
		i8x16 x0 = lowpass8(shrd128(a2h, j2s, 7), a2q, b2r);
		i8x16 x1 = lowpass8(a2q, b2r, shrd128(a2h, j2s, 10));
		p7 = lowpass8(x0, x1, shr128(x1, 1));
		p6 = shr128(p7, 1);
		p5 = shr128(p7, 2);
		p4 = shr128(p7, 3);
		p3 = shr128(p7, 4);
		p2 = shr128(p7, 5);
		p1 = shr128(p7, 6);
		p0 = shr128(p7, 7);
		} goto store8_8x8;
	case I8x8_DDR_C_8:
		j2s = shrc128(load128(P(-8, -1)), 7);
		goto diagonal_down_right_8x8_load_left;
	
	case I8x8_VR_8:
		j2s = load128(P(-1, -1));
	vertical_right_8x8_load_left: {
		i8x16 a2h = ldedge8x8lo();
		i8x16 a2q = shrd128(a2h, j2s, 8);
		i8x16 b2r = shrd128(a2h, j2s, 9);
		i8x16 c2s = shrd128(a2h, j2s, 10);
		i8x16 v0 = lowpass8(a2q, b2r, c2s);
		i8x16 v1 = shl128(v0, 1);
		i8x16 v2 = lowpass8(v0, v1, shl128(v0, 2));
		p0 = shr128(avg8(v0, v1), 8);
		p1 = shr128(v2, 8);
		p2 = shrd128(shl128(v2, 8), p0, 15);
		p3 = shrd128(shl128(v2, 9), p1, 15);
		p4 = shrd128(shl128(v2, 10), p2, 15);
		p5 = shrd128(shl128(v2, 11), p3, 15);
		p6 = shrd128(shl128(v2, 12), p4, 15);
		p7 = shrd128(shl128(v2, 13), p5, 15);
		} goto store8_8x8;
	case I8x8_VR_C_8:
		j2s = shrc128(load128(P(-8, -1)), 7);
		goto vertical_right_8x8_load_left;
	
	case I8x8_HD_8: {
		j2s = load128(P(-1, -1));
		i8x16 a2h = ldedge8x8lo();
		i8x16 a2p = shrd128(a2h, j2s, 7);
		i8x16 a2q = shrd128(a2h, j2s, 8);
		i8x16 b2r = shrd128(a2h, j2s, 9);
		i8x16 v0 = lowpass8(a2p, a2q, b2r);
		i8x16 v1 = shr128(v0, 1);
		i8x16 v2 = lowpass8(v0, v1, shr128(v0, 2));
		p7 = ziplo8(avg8(v0, v1), v2);
		p6 = shr128(p7, 2);
		p5 = shr128(p7, 4);
		p4 = shr128(p7, 6);
		p3 = ziphi64(p7, v2);
		p2 = shr128(p3, 2);
		p1 = shr128(p3, 4);
		p0 = shr128(p3, 6);
		} goto store8_8x8;
	
	case I8x8_VL_8:
		j2y = load128(P(-1, -1));
		k2z = load128(P(0, -1));
	vertical_left_8x8_lowpass: {
		i8x16 v0 = lowpass8(j2y, k2z, shr128(k2z, 1));
		i8x16 v1 = shr128(v0, 1);
		p0 = avg8(v0, v1);
		p1 = lowpass8(v0, v1, shr128(v0, 2));
		p2 = shr128(p0, 1);
		p3 = shr128(p1, 1);
		p4 = shr128(p2, 1);
		p5 = shr128(p3, 1);
		p6 = shr128(p4, 1);
		p7 = shr128(p5, 1);
		} goto store8_8x8;
	case I8x8_VL_C_8: {
		i8x16 j2r = load128(P(-8, -1));
		j2y = shrc128(j2r, 7);
		k2z = shrc128(j2r, 8);
		} goto vertical_left_8x8_lowpass;
	case I8x8_VL_D_8:
		k2z = load128(P(0, -1));
		j2y = shlc128(k2z, 1);
		goto vertical_left_8x8_lowpass;
	case I8x8_VL_CD_8:
		k2z = spreadh8(load64(P(0, -1)));
		j2y = shlc128(k2z, 1);
		goto vertical_left_8x8_lowpass;
	
	case I8x8_HU_8:
		i2a = (i8x16){*P(-1, -1)};
	horizontal_up_8x8_load_left: {
		i2a = ziplo64(ldleft7(i2a, 0, 1, 2, 3, 4, 5, 6), set8(*P(-1, 7)));
		i8x16 v0 = spreadh8(lowpass8(shr128(i2a, 2), shr128(i2a, 1), i2a));
		i8x16 v1 = shr128(v0, 1);
		p0 = ziplo8(avg8(v0, v1), lowpass8(v0, v1, shr128(v0, 2)));
		p1 = shr128(p0, 2);
		p2 = shr128(p0, 4);
		p3 = shr128(p0, 6);
		p4 = ziphi64(p0, v0);
		p5 = shr128(p4, 2);
		p6 = shr128(p4, 4);
		p7 = shr128(p4, 6);
		} goto store8_8x8;
	case I8x8_HU_D_8:
		i2a = (i8x16){*P(-1, 0)};
		goto horizontal_up_8x8_load_left;
	}
}
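// A minimal scalar sketch (hypothetical helper, not called anywhere): per
// H.264 8.3.2.2.1 the Intra_8x8 neighbours are smoothed with a 3-tap filter
// before prediction, which is what the lowpass8 passes over the edge
// vectors above correspond to. Reference over n samples with the ends
// clamped:
static inline void filter_edge_8x8_ref(uint8_t *dst, const uint8_t *src, int n) {
	for (int x = 0; x < n; x++) {
		int l = src[x > 0 ? x - 1 : 0];
		int r = src[x < n - 1 ? x + 1 : n - 1];
		dst[x] = (l + 2 * src[x] + r + 2) >> 2;
	}
}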
/**
 * Intra 16x16
 */
static void decode_intra16x16(int mode, uint8_t * restrict p, size_t stride, i16x8 clip) {
	INIT_P();
	i8x16 pred, top;
	switch (mode) {
	default: __builtin_unreachable();
	
	case I16x16_V_8:
		pred = load128(P(0, -1));
		break;
	
	case I16x16_H_8:
		*(i8x16 *)P(0, 0) = set8(*P(-1, 0));
		*(i8x16 *)P(0, 1) = set8(*P(-1, 1));
		*(i8x16 *)P(0, 2) = set8(*P(-1, 2));
		*(i8x16 *)P(0, 3) = set8(*P(-1, 3));
		*(i8x16 *)P(0, 4) = set8(*P(-1, 4));
		*(i8x16 *)P(0, 5) = set8(*P(-1, 5));
		*(i8x16 *)P(0, 6) = set8(*P(-1, 6));
		*(i8x16 *)P(0, 7) = set8(*P(-1, 7));
		*(i8x16 *)P(0, 8) = set8(*P(-1, 8));
		*(i8x16 *)P(0, 9) = set8(*P(-1, 9));
		*(i8x16 *)P(0, 10) = set8(*P(-1, 10));
		*(i8x16 *)P(0, 11) = set8(*P(-1, 11));
		*(i8x16 *)P(0, 12) = set8(*P(-1, 12));
		*(i8x16 *)P(0, 13) = set8(*P(-1, 13));
		*(i8x16 *)P(0, 14) = set8(*P(-1, 14));
		*(i8x16 *)P(0, 15) = set8(*P(-1, 15));
		return;
	
	case I16x16_DC_8: {
		i8x16 t = load128(P(0, -1));
		i8x16 l = ldleft16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
		pred = broadcast8(shrru16(sumd8(t, l), 5), __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__);
		} break;
	case I16x16_DCA_8:
		top = load128(P(0, -1));
	dca_16x16_sum:
		pred = broadcast8(shrru16(sum8(top), 4), __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__);
		break;
	case I16x16_DCB_8:
		top = ldleft16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
		goto dca_16x16_sum;
	case I16x16_DCAB_8:
		pred = set8(-128);
		break;
	
	case I16x16_P_8: {
		i8x16 tl = load64(P(-1, -1));
		i8x16 tr = load64(P(8, -1));
		i8x16 lt = ldleft7(tl, 0, 1, 2, 3, 4, 5, 6);
		i8x16 lb = ldleft8(8, 9, 10, 11, 12, 13, 14, 15);
#if defined(__SSE2__)
		i8x16 z = {};
#ifdef __SSSE3__
		i8x16 m = {-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8};
		i16x8 mul = ziphi8(m, z);
		i16x8 v0 = maddubs(ziplo64(tl, tr), m);
		i16x8 v1 = maddubs(ziplo64(lt, lb), m);
#else
		i16x8 n = {-8, -7, -6, -5, -4, -3, -2, -1};
		i16x8 mul = {1, 2, 3, 4, 5, 6, 7, 8};
		i16x8 v0 = (i16x8)ziplo8(tl, z) * n + (i16x8)ziplo8(tr, z) * mul;
		i16x8 v1 = (i16x8)ziplo8(lt, z) * n + (i16x8)ziplo8(lb, z) * mul;
#endif
		i16x8 v2 = (i16x8)ziplo32(v0, v1) + (i16x8)ziphi32(v0, v1);
		i16x8 v3 = v2 + (i16x8)shr128(v2, 8);
		i16x8 HV = v3 + shufflelo(v3, 1, 0, 3, 2); // H, H, V, V
		i16x8 a = (broadcast16((i16x8)shr128(tr, 7) + (i16x8)shr128(lb, 7), 0) - -1) << 4;
#elif defined(__ARM_NEON)
		i16x8 mul = {1, 2, 3, 4, 5, 6, 7, 8};
		i16x8 v0 = sublou8(tr, vrev64q_s8(tl)) * mul;
		i16x8 v1 = sublou8(lb, vrev64q_s8(lt)) * mul;
		i16x8 v2 = vpaddq_s16(v0, v1);
		i16x8 v3 = vpaddq_s16(v2, v2);
		i16x8 HV = (i16x8)vtrn1q_s16(v3, v3) + (i16x8)vtrn2q_s16(v3, v3); // {H, H, V, V, 0, 0, 0, 0}, -9180..9180
		i16x8 a = (broadcast16((i16x8)addlou8(tr, lb) - -1, 7)) << 4;
#endif
		i16x8 v4 = shrrs16(HV + (HV >> 2), 4); // (5 * HV + 32) >> 6, -717..717
		i16x8 b = broadcast32(v4, 0);
		i16x8 c = broadcast32(v4, 1);
		i16x8 p1 = (a + c) - (c << 3) + (b * mul);
		i16x8 p0 = p1 - (b << 3);
		for (int i = 16; i--; p0 += c, p1 += c, p += stride)
			*(i8x16 *)p = shrpus16(p0, p1, 5);
		} return;
	}
	*(i8x16 *)P(0, 0) = pred;
	*(i8x16 *)P(0, 1) = pred;
	*(i8x16 *)P(0, 2) = pred;
	*(i8x16 *)P(0, 3) = pred;
	*(i8x16 *)P(0, 4) = pred;
	*(i8x16 *)P(0, 5) = pred;
	*(i8x16 *)P(0, 6) = pred;
	*(i8x16 *)P(0, 7) = pred;
	*(i8x16 *)P(0, 8) = pred;
	*(i8x16 *)P(0, 9) = pred;
	*(i8x16 *)P(0, 10) = pred;
	*(i8x16 *)P(0, 11) = pred;
	*(i8x16 *)P(0, 12) = pred;
	*(i8x16 *)P(0, 13) = pred;
	*(i8x16 *)P(0, 14) = pred;
	*(i8x16 *)P(0, 15) = pred;
}
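// A minimal scalar sketch (hypothetical helper, not part of the decoder) of
// the I16x16 plane mode that the vector code above implements, following
// H.264 8.3.3.4. top points at p[0,-1] with p[-1,-1] readable at top[-1],
// and left[y] is p[-1,y]:
static inline void plane_16x16_ref(uint8_t *dst, size_t stride, const uint8_t *top, const uint8_t *left) {
	int H = 0, V = 0;
	for (int i = 1; i <= 8; i++) {
		H += i * (top[7 + i] - top[7 - i]); // top[-1] is p[-1,-1]
		V += i * (left[7 + i] - (i == 8 ? top[-1] : left[7 - i]));
	}
	int a = 16 * (top[15] + left[15]);
	int b = (5 * H + 32) >> 6;
	int c = (5 * V + 32) >> 6;
	for (int y = 0; y < 16; y++, dst += stride)
		for (int x = 0; x < 16; x++) {
			int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
			dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
		}
}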
/**
 * Intra Chroma
 */
static void decode_intraChroma(int mode, uint8_t * restrict p, size_t stride, i16x8 clip) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	static const i8x16 shufDC = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
	static const i8x16 shufDCA = {0, 0, 0, 0, 12, 12, 12, 12, 0, 0, 0, 0, 12, 12, 12, 12};
	static const i8x16 shufDCB = {0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12};
#else
	static const i8x16 shufDC = {1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13};
	static const i8x16 shufDCA = {1, 1, 1, 1, 13, 13, 13, 13, 1, 1, 1, 1, 13, 13, 13, 13};
	static const i8x16 shufDCB = {1, 1, 1, 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 13, 13, 13};
#endif
	INIT_P();
	i8x16 bt, rt, bl, rl, shuf;
	i64x2 bpred, rpred;
	switch (mode) {
	default: __builtin_unreachable();
	
	case IC8x8_DC_8:
		bt = load64(P(0, -2));
		rt = load64(P(0, -1));
		bl = ldleft8(0, 2, 4, 6, 8, 10, 12, 14);
		rl = ldleft8(1, 3, 5, 7, 9, 11, 13, 15);
		shuf = shufDC;
	chroma_dc_8x8_sum: {
#if defined(__SSE2__)
		i8x16 b = ziplo64(bt, bl);
		i8x16 r = ziplo64(rt, rl);
		i16x8 b01 = sumh8(shuffle32(b, 0, 2, 1, 1));
		i16x8 r01 = sumh8(shuffle32(r, 0, 2, 1, 1));
		i16x8 b23 = sumh8(shuffle32(b, 3, 3, 1, 3));
		i16x8 r23 = sumh8(shuffle32(r, 3, 3, 1, 3));
		bpred = shuffle(shrru16(packs32(b01, b23), 3), shuf);
		rpred = shuffle(shrru16(packs32(r01, r23), 3), shuf);
#elif defined(__ARM_NEON)
		i8x16 t = ziplo64(bt, rt);
		i8x16 l = ziplo64(bl, rl);
		i16x8 v0 = vpaddlq_u8(vtrn1q_s32(t, l)); // top-left sums
		i16x8 v1 = vpaddlq_u8(vtrn2q_s32(t, t)); // top-right sums
		i16x8 v2 = vpaddlq_u8(vtrn2q_s32(l, l)); // bottom-left sums
		i16x8 v3 = vpaddlq_u8(vtrn2q_s32(t, l)); // bottom-right sums
		i8x16 v4 = shrru16(vpaddq_u16(vpaddq_u16(v0, v1), vpaddq_u16(v2, v3)), 3);
		bpred = shuffle(v4, shuf);
		rpred = shuffle(shr128(v4, 2), shuf);
#endif
		} break;
	case IC8x8_DCA_8:
		bt = bl = load64(P(0, -2));
		rt = rl = load64(P(0, -1));
		shuf = shufDCA;
		goto chroma_dc_8x8_sum;
	case IC8x8_DCB_8:
		bt = bl = ldleft8(0, 2, 4, 6, 8, 10, 12, 14);
		rt = rl = ldleft8(1, 3, 5, 7, 9, 11, 13, 15);
		shuf = shufDCB;
		goto chroma_dc_8x8_sum;
	case IC8x8_DCAB_8:
		bpred = rpred = set8(-128);
		break;
	
	case IC8x8_H_8:
		*(int64_t *)P(0, 0) = ((i64x2)set8(*P(-1, 0)))[0];
		*(int64_t *)P(0, 1) = ((i64x2)set8(*P(-1, 1)))[0];
		*(int64_t *)P(0, 2) = ((i64x2)set8(*P(-1, 2)))[0];
		*(int64_t *)P(0, 3) = ((i64x2)set8(*P(-1, 3)))[0];
		*(int64_t *)P(0, 4) = ((i64x2)set8(*P(-1, 4)))[0];
		*(int64_t *)P(0, 5) = ((i64x2)set8(*P(-1, 5)))[0];
		*(int64_t *)P(0, 6) = ((i64x2)set8(*P(-1, 6)))[0];
		*(int64_t *)P(0, 7) = ((i64x2)set8(*P(-1, 7)))[0];
		*(int64_t *)P(0, 8) = ((i64x2)set8(*P(-1, 8)))[0];
		*(int64_t *)P(0, 9) = ((i64x2)set8(*P(-1, 9)))[0];
		*(int64_t *)P(0, 10) = ((i64x2)set8(*P(-1, 10)))[0];
		*(int64_t *)P(0, 11) = ((i64x2)set8(*P(-1, 11)))[0];
		*(int64_t *)P(0, 12) = ((i64x2)set8(*P(-1, 12)))[0];
		*(int64_t *)P(0, 13) = ((i64x2)set8(*P(-1, 13)))[0];
		*(int64_t *)P(0, 14) = ((i64x2)set8(*P(-1, 14)))[0];
		*(int64_t *)P(0, 15) = ((i64x2)set8(*P(-1, 15)))[0];
		return;
	
	case IC8x8_V_8: {
		i64x2 t = {*(int64_t *)P(0, -2), *(int64_t *)P(0, -1)};
		bpred = ziplo64(t, t);
		rpred = ziphi64(t, t);
		} break;
	
	case IC8x8_P_8: {
		i8x16 btl = load128(P(-1, -2));
		i8x16 rtl = load128(P(-1, -1));
		i8x16 btr = shr128(btl, 5);
		i8x16 rtr = shr128(rtl, 5);
		i8x16 blt = ldleft3(btl, 0, 2, 4);
		i8x16 rlt = ldleft3(rtl, 1, 3, 5);
		i8x16 blb = ldleft4(8, 10, 12, 14);
		i8x16 rlb = ldleft4(9, 11, 13, 15);
#if defined(__SSE2__)
		i8x16 n = {-4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1};
		i8x16 m = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4};
		i8x16 v0 = ziplo32(btr, rtr);
		i8x16 v1 = ziplo32(blb, rlb);
		i16x8 v2 = maddubs(ziplo64(ziplo32(btl, rtl), ziplo32(blt, rlt)), n);
		i16x8 v3 = maddubs(ziplo64(v0, v1), m);
		i16x8 v4 = v2 + v3;
		i16x8 v5 = (((u32x4)v0 >> 24) + ((u32x4)v1 >> 24) - -1) << 4;
		i16x8 HV = v4 + shufflelo(shufflehi(v4, 1, 0, 3, 2), 1, 0, 3, 2); // Hb, Hb, Hr, Hr, Vb, Vb, Vr, Vr
		i16x8 ba = broadcast16(v5, 0); // 16..8176
		i16x8 ra = broadcast16(v5, 2); // 16..8176
#elif defined(__ARM_NEON)
		i16x8 v0 = sublou8(ziplo32(btr, rtr), vrev32q_s8(ziplo32(btl, rtl)));
		i16x8 v1 = sublou8(ziplo32(blb, rlb), vrev32q_s8(ziplo32(blt, rlt)));
		i16x8 m = {1, 2, 3, 4, 1, 2, 3, 4};
		i16x8 v4 = vpaddq_s16(v0 * m, v1 * m);
		i16x8 HV = (i16x8)vtrn1q_s16(v4, v4) + (i16x8)vtrn2q_s16(v4, v4);
		i16x8 ba = (broadcast16((i16x8)addlou8(btr, blb) - -1, 3)) << 4;
		i16x8 ra = (broadcast16((i16x8)addlou8(rtr, rlb) - -1, 3)) << 4;
#endif
		i16x8 hv = shrrs16(HV + (HV >> 4), 1); // (17 * HV + 16) >> 5
		i16x8 bb = broadcast32(hv, 0);
		i16x8 rb = broadcast32(hv, 1);
		i16x8 bc = broadcast32(hv, 2);
		i16x8 rc = broadcast32(hv, 3);
		i16x8 bp = ba - bc - bc - bc + bb * (i16x8){-3, -2, -1, 0, 1, 2, 3, 4};
		i16x8 rp = ra - rc - rc - rc + rb * (i16x8){-3, -2, -1, 0, 1, 2, 3, 4};
		for (int i = 8; i--; bp += bc, rp += rc, p += stride * 2) {
			i64x2 v6 = shrpus16(bp, rp, 5);
			*(int64_t *)p = v6[0];
			*(int64_t *)(p + stride) = v6[1];
		}
		} return;
	}
	*(int64_t *)P(0, 0) = bpred[0];
	*(int64_t *)P(0, 1) = rpred[0];
	*(int64_t *)P(0, 2) = bpred[0];
	*(int64_t *)P(0, 3) = rpred[0];
	*(int64_t *)P(0, 4) = bpred[0];
	*(int64_t *)P(0, 5) = rpred[0];
	*(int64_t *)P(0, 6) = bpred[0];
	*(int64_t *)P(0, 7) = rpred[0];
	*(int64_t *)P(0, 8) = bpred[1];
	*(int64_t *)P(0, 9) = rpred[1];
	*(int64_t *)P(0, 10) = bpred[1];
	*(int64_t *)P(0, 11) = rpred[1];
	*(int64_t *)P(0, 12) = bpred[1];
	*(int64_t *)P(0, 13) = rpred[1];
	*(int64_t *)P(0, 14) = bpred[1];
	*(int64_t *)P(0, 15) = rpred[1];
}
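// A minimal scalar sketch (hypothetical helper, not part of the decoder) of
// the chroma plane mode for one 8x8 4:2:0 plane, following H.264 8.3.4.4;
// the vector code above computes both Cb and Cr interleaved, and
// (17 * HV + 16) >> 5 equals the spec's (34 * HV + 32) >> 6. top points at
// p[0,-1] with p[-1,-1] readable at top[-1], and left[y] is p[-1,y]:
static inline void plane_chroma_ref(uint8_t *dst, size_t stride, const uint8_t *top, const uint8_t *left) {
	int H = 0, V = 0;
	for (int i = 1; i <= 4; i++) {
		H += i * (top[3 + i] - top[3 - i]); // top[-1] is p[-1,-1]
		V += i * (left[3 + i] - (i == 4 ? top[-1] : left[3 - i]));
	}
	int a = 16 * (top[7] + left[7]);
	int b = (17 * H + 16) >> 5;
	int c = (17 * V + 16) >> 5;
	for (int y = 0; y < 8; y++, dst += stride)
		for (int x = 0; x < 8; x++) {
			int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
			dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
		}
}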