// If we have ARM NEON support, pick off 48 bytes at a time:
while (srclen >= 48)
{
	uint8x16x3_t str;
	uint8x16x4_t res;

	// Load 48 bytes and deinterleave:
	str = vld3q_u8((uint8_t *)c);

	// Reshuffle:
	res = enc_reshuffle(str);

	// Translate reshuffled bytes to the Base64 alphabet:
	res = enc_translate(res);

	// Interleave and store result:
	vst4q_u8((uint8_t *)o, res);

	c += 48;	// 3 * 16 bytes of input
	o += 64;	// 4 * 16 bytes of output
	outl += 64;
	srclen -= 48;
}