[PATCH v6 01/12] Implement internal functions for efficient CRC computation.

Add two new internal functions (IFN_CRC, IFN_CRC_REV), to provide faster
CRC generation.
One performs bit-forward and the other bit-reversed CRC computation.
If CRC optabs are supported, they are used for the CRC computation.
Otherwise, table-based CRC is generated.
The supported data and CRC sizes are 8, 16, 32, and 64 bits.
The polynomial is without the leading 1.
A table with 256 elements is used to store precomputed CRCs.
For the reflection of inputs and the output, a simple algorithm involving
SHIFT, AND, and OR operations is used.

gcc/

	* doc/md.texi (crc@var{m}@var{n}4, crc_rev@var{m}@var{n}4): Document.
	* expr.cc (calculate_crc): New function.
	(assemble_crc_table): Likewise.
	(generate_crc_table): Likewise.
	(calculate_table_based_CRC): Likewise.
	(expand_crc_table_based): Likewise.
	(gen_common_operation_to_reflect): Likewise.
	(reflect_64_bit_value): Likewise.
	(reflect_32_bit_value): Likewise.
	(reflect_16_bit_value): Likewise.
	(reflect_8_bit_value): Likewise.
	(generate_reflecting_code_standard): Likewise.
	(expand_reversed_crc_table_based): Likewise.
	* expr.h (generate_reflecting_code_standard): New function declaration.
	(expand_crc_table_based): Likewise.
	(expand_reversed_crc_table_based): Likewise.
	* internal-fn.cc (crc_direct): Define.
	(direct_crc_optab_supported_p): Likewise.
	(expand_crc_optab_fn): New function.
	* internal-fn.def (CRC, CRC_REV): New internal functions.
	* optabs.def (crc_optab, crc_rev_optab): New optabs.

	Signed-off-by: Mariam Arutunian <mariamarutunian@gmail.com>
	Co-authored-by: Joern Rennecke <joern.rennecke@embecosm.com>
	Co-authored-by: Jeff Law <jlaw@ventanamicro.com>
This commit is contained in:
Mariam Arutunian 2024-11-11 12:48:34 -07:00 committed by Jeff Law
parent bcb764ec7c
commit bb46d05ad6
6 changed files with 446 additions and 0 deletions

View file

@ -8578,6 +8578,20 @@ Return 1 if operand 1 is a normal floating point number and 0
otherwise. @var{m} is a scalar floating point mode. Operand 0
has mode @code{SImode}, and operand 1 has mode @var{m}.
@cindex @code{crc@var{m}@var{n}4} instruction pattern
@item @samp{crc@var{m}@var{n}4}
Calculate a bit-forward CRC using operands 1, 2 and 3,
then store the result in operand 0.
Operand 1 is the initial CRC, operand 2 is the data and operand 3 is the
polynomial without leading 1.
Operands 0, 1 and 3 have mode @var{n} and operand 2 has mode @var{m}, where
both modes are integers. The size of CRC to be calculated is determined by the
mode; for example, if @var{n} is @code{HImode}, a CRC16 is calculated.
@cindex @code{crc_rev@var{m}@var{n}4} instruction pattern
@item @samp{crc_rev@var{m}@var{n}4}
Similar to @samp{crc@var{m}@var{n}4}, but calculates a bit-reversed CRC.
@end table
@end ifset

View file

@ -14177,3 +14177,350 @@ int_expr_size (const_tree exp)
return tree_to_shwi (size);
}
/* Run eight bit-forward (most-significant-bit first) CRC steps on CRC
   using POLYNOMIAL and return the result truncated to CRC_BITS bits.
   CRC is first moved up by CRC_BITS - 8 positions, so CRC_BITS must be
   at least 8.  */

static unsigned HOST_WIDE_INT
calculate_crc (unsigned HOST_WIDE_INT crc,
               unsigned HOST_WIDE_INT polynomial,
               unsigned short crc_bits)
{
  const unsigned HOST_WIDE_INT top_bit = HOST_WIDE_INT_1U << (crc_bits - 1);

  /* Align the incoming value with the top byte of the CRC register.  */
  crc <<= crc_bits - 8;

  for (unsigned step = 0; step < 8; step++)
    {
      /* The polynomial is folded in only when the bit about to be shifted
         out of the CRC_BITS-wide register is set.  */
      const unsigned HOST_WIDE_INT feedback = (crc & top_bit) ? polynomial : 0;
      crc = (crc << 1) ^ feedback;
    }

  /* Drop everything above bit CRC_BITS - 1.  When the CRC already fills
     the whole host word there is nothing to drop, and the mask could not
     be formed with a shift anyway.  */
  if (crc_bits < sizeof (crc) * CHAR_BIT)
    crc &= (HOST_WIDE_INT_1U << crc_bits) - 1;

  return crc;
}
/* Assemble CRC table with 256 elements for the given POLYNOM and CRC_BITS with
   given ID.
   ID is the identifier of the table; it is used both as the declaration name
   and as the assembler name of the table.
   POLYNOM is the polynomial used to calculate the CRC table's elements.
   CRC_BITS is the size of CRC, may be 8, 16, ... .
   Returns a SYMBOL_REF (in Pmode) naming the table.  */

rtx
assemble_crc_table (tree id, unsigned HOST_WIDE_INT polynom,
                    unsigned short crc_bits)
{
  unsigned table_el_n = 0x100;

  /* Build the element type once and share it between the array type and
     every initializer element, instead of requesting a new unsigned type
     for each of the 256 entries.  */
  tree el_type = make_unsigned_type (crc_bits);
  tree ar = build_array_type (el_type,
                              build_index_type (size_int (table_el_n - 1)));
  tree decl = build_decl (UNKNOWN_LOCATION, VAR_DECL, id, ar);
  SET_DECL_ASSEMBLER_NAME (decl, id);
  DECL_ARTIFICIAL (decl) = 1;
  rtx tab = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (id));
  TREE_ASM_WRITTEN (decl) = 0;

  /* Initialize the table: entry I holds calculate_crc (I, POLYNOM,
     CRC_BITS).  */
  vec<tree, va_gc> *initial_values;
  vec_alloc (initial_values, table_el_n);
  for (size_t i = 0; i < table_el_n; ++i)
    {
      unsigned HOST_WIDE_INT crc = calculate_crc (i, polynom, crc_bits);
      tree element = build_int_cstu (el_type, crc);
      vec_safe_push (initial_values, element);
    }
  DECL_INITIAL (decl) = build_constructor_from_vec (ar, initial_values);

  /* The table is constant static data.  */
  TREE_READONLY (decl) = 1;
  TREE_STATIC (decl) = 1;

  if (TREE_PUBLIC (id))
    {
      TREE_PUBLIC (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }

  mark_decl_referenced (decl);
  varpool_node::finalize_decl (decl);

  return tab;
}
/* Generate CRC lookup table by calculating CRC for all possible
   8-bit data values.  The table gets a name derived from CRC_BITS and
   POLYNOM; if an identifier with that name already exists, a reference to
   it is returned instead of assembling a second copy.
   POLYNOM is the polynomial used to calculate the CRC table's elements.
   CRC_BITS is the size of CRC, may be 8, 16, ... , at most 64.
   Returns a SYMBOL_REF (in Pmode) naming the table.  */

rtx
generate_crc_table (unsigned HOST_WIDE_INT polynom, unsigned short crc_bits)
{
  gcc_assert (crc_bits <= 64);

  /* Buf size - 24 letters + 6 '_'
     + 20 numbers (2 for crc bit size + 2 for 0x + 16 for 64-bit polynomial)
     + 1 for \0.  */
  char buf[51];

  /* The buffer is sized to fit exactly, so format with an explicit bound
     and verify that nothing was truncated rather than trusting the
     arithmetic above.  */
  int len = snprintf (buf, sizeof (buf),
                      "crc_table_for_crc_%u_polynomial_"
                      HOST_WIDE_INT_PRINT_HEX,
                      crc_bits, polynom);
  gcc_assert (len > 0 && (size_t) len < sizeof (buf));

  /* Reuse the table if one with this name has been requested before.  */
  tree id = maybe_get_identifier (buf);
  if (id)
    return gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (id));

  id = get_identifier (buf);
  return assemble_crc_table (id, polynom, crc_bits);
}
/* Generate table-based CRC code for the given CRC, INPUT_DATA and the
   POLYNOMIAL (without leading 1).

   First, using POLYNOMIAL's value generates CRC table of 256 elements,
   then generates the assembly for the following code,
   where crc_bit_size and data_bit_size may be 8, 16, 32, 64, depending on CRC:

     for (int i = 0; i < data_bit_size / 8; i++)
       crc = (crc << 8)
             ^ crc_table[(crc >> (crc_bit_size - 8))
                         ^ ((data >> (data_bit_size - (i + 1) * 8)) & 0xFF)];

   So to take values from the table, we need 8-bit data.
   If input data size is not 8, then first we extract upper 8 bits,
   then the other 8 bits, and so on.

   *CRC holds the running CRC on entry and the final CRC on exit.
   CRC_MODE is the mode of the CRC and DATA_MODE the mode of INPUT_DATA.
   All arithmetic is carried out in the mode of *CRC, so DATA_MODE is
   expected to be no wider than that mode.  */

void
calculate_table_based_CRC (rtx *crc, const rtx &input_data,
                           const rtx &polynomial,
                           machine_mode crc_mode, machine_mode data_mode)
{
  unsigned short crc_bit_size = GET_MODE_BITSIZE (crc_mode).to_constant ();
  unsigned short data_size = GET_MODE_SIZE (data_mode).to_constant ();
  machine_mode mode = GET_MODE (*crc);
  rtx tab = generate_crc_table (UINTVAL (polynomial), crc_bit_size);

  for (unsigned short i = 0; i < data_size; i++)
    {
      /* crc >> (crc_bit_size - 8).  */
      *crc = force_reg (crc_mode, *crc);
      rtx op1 = expand_shift (RSHIFT_EXPR, mode, *crc, crc_bit_size - 8,
                              NULL_RTX, 1);

      /* data >> (8 * (GET_MODE_SIZE (data_mode).to_constant () - i - 1)).  */
      unsigned range_8 = 8 * (data_size - i - 1);

      /* The shift, mask and xor below are all performed in MODE, so the
         data has to be in MODE as well.  Zero-extend it from DATA_MODE
         into a fresh MODE register instead of handing a DATA_MODE
         register to operations that claim to work in MODE.  */
      rtx data = gen_reg_rtx (mode);
      convert_move (data, force_reg (data_mode, input_data), 1);
      data = expand_shift (RSHIFT_EXPR, mode, data, range_8, NULL_RTX, 1);

      /* data >> (8 * (GET_MODE_SIZE (data_mode)
                                      .to_constant () - i - 1)) & 0xFF.  */
      rtx data_final = expand_and (mode, data,
                                   gen_int_mode (255, mode), NULL_RTX);

      /* (crc >> (crc_bit_size - 8)) ^ data_8bit.  */
      rtx in = expand_binop (mode, xor_optab, op1, data_final,
                             NULL_RTX, 1, OPTAB_WIDEN);

      /* ((crc >> (crc_bit_size - 8)) ^ data_8bit) & 0xFF.  */
      rtx index = expand_and (mode, in, gen_int_mode (255, mode),
                              NULL_RTX);

      /* Scale the index by the size of one table element to get a byte
         offset into the table.  */
      int log_crc_size = exact_log2 (GET_MODE_SIZE (crc_mode).to_constant ());
      index = expand_shift (LSHIFT_EXPR, mode, index,
                            log_crc_size, NULL_RTX, 0);

      /* Address of the table element: table symbol + byte offset.  */
      rtx addr = gen_reg_rtx (Pmode);
      convert_move (addr, index, 1);
      addr = expand_binop (Pmode, add_optab, addr, tab, NULL_RTX,
                           0, OPTAB_DIRECT);

      /* crc_table[(crc >> (crc_bit_size - 8)) ^ data_8bit]  */
      rtx tab_el = validize_mem (gen_rtx_MEM (crc_mode, addr));

      /* (crc << 8) if CRC is larger than 8, otherwise crc = 0.  */
      rtx high = NULL_RTX;
      if (crc_bit_size != 8)
        high = expand_shift (LSHIFT_EXPR, mode, *crc, 8, NULL_RTX, 0);
      else
        high = gen_int_mode (0, mode);

      /* crc = (crc << 8)
               ^ crc_table[(crc >> (crc_bit_size - 8)) ^ data_8bit];  */
      *crc = expand_binop (mode, xor_optab, tab_el, high, NULL_RTX, 1,
                           OPTAB_WIDEN);
    }
}
/* Expand a bit-forward, table-based CRC.

   OP0 receives the result, OP1 is the initial CRC, OP2 is the data and
   OP3 is the polynomial (without leading 1), which must be a CONST_INT.
   DATA_MODE is the mode of the data; the CRC mode is taken from OP0.

   The emitted code corresponds to

     crc = crc_init;
     for (int i = 0; i < data_bit_size / 8; i++)
       crc = (crc << 8)
             ^ crc_table[(crc >> (crc_bit_size - 8))
                         ^ ((data >> (data_bit_size - (i + 1) * 8)) & 0xFF)];
     result = crc;

   where crc_bit_size and data_bit_size may be 8, 16, 32, 64.  */

void
expand_crc_table_based (rtx op0, rtx op1, rtx op2, rtx op3,
                        machine_mode data_mode)
{
  gcc_assert (!CONST_INT_P (op0));
  gcc_assert (CONST_INT_P (op3));

  const machine_mode result_mode = GET_MODE (op0);

  /* Work on a scratch pseudo so that OP0 is written exactly once, at the
     very end.  */
  rtx running_crc = gen_reg_rtx (result_mode);
  convert_move (running_crc, op1, 0);

  calculate_table_based_CRC (&running_crc, op2, op3, result_mode, data_mode);

  convert_move (op0, running_crc, 0);
}
/* Emit one swap stage of a bit reflection:

     *OP = ((*OP & AND1_VALUE) << SHIFT_VAL)
           | ((*OP & AND2_VALUE) >> SHIFT_VAL);

   The right shift is a logical (unsigned) one.  All operations are done
   in the mode of *OP.  */

void
gen_common_operation_to_reflect (rtx *op,
                                 unsigned HOST_WIDE_INT and1_value,
                                 unsigned HOST_WIDE_INT and2_value,
                                 unsigned shift_val)
{
  const machine_mode op_mode = GET_MODE (*op);

  /* Part selected by AND1_VALUE, moved towards the high end.  */
  rtx up_mask = gen_int_mode (and1_value, op_mode);
  rtx moved_up = expand_and (op_mode, *op, up_mask, NULL_RTX);
  moved_up = expand_shift (LSHIFT_EXPR, op_mode, moved_up, shift_val,
                           moved_up, 0);

  /* Part selected by AND2_VALUE, moved towards the low end.  */
  rtx down_mask = gen_int_mode (and2_value, op_mode);
  rtx moved_down = expand_and (op_mode, *op, down_mask, NULL_RTX);
  moved_down = expand_shift (RSHIFT_EXPR, op_mode, moved_down, shift_val,
                             moved_down, 1);

  /* Recombine the two parts, suggesting the old *OP as the target.  */
  *op = expand_binop (op_mode, ior_optab, moved_up, moved_down, *op, 0,
                      OPTAB_LIB_WIDEN);
}
/* Reverse the bit order of the 64-bit value *OP by swapping ever smaller
   neighbouring groups: 32-bit halves first, then groups of 16, 8, 4, 2
   and finally single bits.  */

void
reflect_64_bit_value (rtx *op)
{
  static const struct
  {
    unsigned HOST_WIDE_INT shift_up_mask;
    unsigned HOST_WIDE_INT shift_down_mask;
    unsigned distance;
  } stages[] = {
    { HOST_WIDE_INT_C (0x00000000FFFFFFFF),
      HOST_WIDE_INT_C (0xFFFFFFFF00000000), 32 },
    { HOST_WIDE_INT_C (0x0000FFFF0000FFFF),
      HOST_WIDE_INT_C (0xFFFF0000FFFF0000), 16 },
    { HOST_WIDE_INT_C (0x00FF00FF00FF00FF),
      HOST_WIDE_INT_C (0xFF00FF00FF00FF00), 8 },
    { HOST_WIDE_INT_C (0x0F0F0F0F0F0F0F0F),
      HOST_WIDE_INT_C (0xF0F0F0F0F0F0F0F0), 4 },
    { HOST_WIDE_INT_C (0x3333333333333333),
      HOST_WIDE_INT_C (0xCCCCCCCCCCCCCCCC), 2 },
    { HOST_WIDE_INT_C (0x5555555555555555),
      HOST_WIDE_INT_C (0xAAAAAAAAAAAAAAAA), 1 }
  };

  for (unsigned i = 0; i < sizeof (stages) / sizeof (stages[0]); i++)
    gen_common_operation_to_reflect (op, stages[i].shift_up_mask,
                                     stages[i].shift_down_mask,
                                     stages[i].distance);
}
/* Reverse the bit order of the 32-bit value *OP by swapping ever smaller
   neighbouring groups: 16-bit halves first, then groups of 8, 4, 2 and
   finally single bits.  */

void
reflect_32_bit_value (rtx *op)
{
  static const struct
  {
    unsigned HOST_WIDE_INT shift_up_mask;
    unsigned HOST_WIDE_INT shift_down_mask;
    unsigned distance;
  } stages[] = {
    { HOST_WIDE_INT_C (0x0000FFFF), HOST_WIDE_INT_C (0xFFFF0000), 16 },
    { HOST_WIDE_INT_C (0x00FF00FF), HOST_WIDE_INT_C (0xFF00FF00), 8 },
    { HOST_WIDE_INT_C (0x0F0F0F0F), HOST_WIDE_INT_C (0xF0F0F0F0), 4 },
    { HOST_WIDE_INT_C (0x33333333), HOST_WIDE_INT_C (0xCCCCCCCC), 2 },
    { HOST_WIDE_INT_C (0x55555555), HOST_WIDE_INT_C (0xAAAAAAAA), 1 }
  };

  for (unsigned i = 0; i < sizeof (stages) / sizeof (stages[0]); i++)
    gen_common_operation_to_reflect (op, stages[i].shift_up_mask,
                                     stages[i].shift_down_mask,
                                     stages[i].distance);
}
/* Reverse the bit order of the 16-bit value *OP by swapping ever smaller
   neighbouring groups: the two bytes first, then groups of 4, 2 and
   finally single bits.  */

void
reflect_16_bit_value (rtx *op)
{
  static const struct
  {
    unsigned HOST_WIDE_INT shift_up_mask;
    unsigned HOST_WIDE_INT shift_down_mask;
    unsigned distance;
  } stages[] = {
    { HOST_WIDE_INT_C (0x00FF), HOST_WIDE_INT_C (0xFF00), 8 },
    { HOST_WIDE_INT_C (0x0F0F), HOST_WIDE_INT_C (0xF0F0), 4 },
    { HOST_WIDE_INT_C (0x3333), HOST_WIDE_INT_C (0xCCCC), 2 },
    { HOST_WIDE_INT_C (0x5555), HOST_WIDE_INT_C (0xAAAA), 1 }
  };

  for (unsigned i = 0; i < sizeof (stages) / sizeof (stages[0]); i++)
    gen_common_operation_to_reflect (op, stages[i].shift_up_mask,
                                     stages[i].shift_down_mask,
                                     stages[i].distance);
}
/* Reverse the bit order of the 8-bit value *OP by swapping ever smaller
   neighbouring groups: the two nibbles first, then pairs of bits and
   finally single bits.  */

void
reflect_8_bit_value (rtx *op)
{
  static const struct
  {
    unsigned HOST_WIDE_INT shift_up_mask;
    unsigned HOST_WIDE_INT shift_down_mask;
    unsigned distance;
  } stages[] = {
    { HOST_WIDE_INT_C (0x0F), HOST_WIDE_INT_C (0xF0), 4 },
    { HOST_WIDE_INT_C (0x33), HOST_WIDE_INT_C (0xCC), 2 },
    { HOST_WIDE_INT_C (0x55), HOST_WIDE_INT_C (0xAA), 1 }
  };

  for (unsigned i = 0; i < sizeof (stages) / sizeof (stages[0]); i++)
    gen_common_operation_to_reflect (op, stages[i].shift_up_mask,
                                     stages[i].shift_down_mask,
                                     stages[i].distance);
}
/* Emit a shift/and/or sequence that reverses the bit order of *OP.
   The width of *OP's mode must lie between 8 and 64 bits; 64-, 32- and
   16-bit modes get their dedicated reflectors and every other accepted
   width is handled by the 8-bit one.  OP's mode may be narrower than
   word_mode.  */

void
generate_reflecting_code_standard (rtx *op)
{
  const unsigned width = GET_MODE_BITSIZE (GET_MODE (*op)).to_constant ();

  gcc_assert (width >= 8 && width <= 64);

  switch (width)
    {
    case 64:
      reflect_64_bit_value (op);
      break;

    case 32:
      reflect_32_bit_value (op);
      break;

    case 16:
      reflect_16_bit_value (op);
      break;

    default:
      reflect_8_bit_value (op);
      break;
    }
}
/* Expand a bit-reversed, table-based CRC.

   OP0 receives the result, OP1 is the initial CRC, OP2 is the data and
   OP3 is the polynomial (without leading 1), which must be a CONST_INT.
   DATA_MODE is the mode of the data; the CRC mode is taken from OP0.
   GEN_REFLECTING_CODE emits the code that reverses the bits of the rtx
   it is handed.

   The emitted code corresponds to

     crc = reflect (crc_init);
     data = reflect (data);
     for (int i = 0; i < data_bit_size / 8; i++)
       crc = (crc << 8)
             ^ crc_table[(crc >> (crc_bit_size - 8))
                         ^ ((data >> (data_bit_size - (i + 1) * 8)) & 0xFF)];
     result = reflect (crc);

   where crc_bit_size and data_bit_size may be 8, 16, 32, 64.  */

void
expand_reversed_crc_table_based (rtx op0, rtx op1, rtx op2, rtx op3,
                                 machine_mode data_mode,
                                 void (*gen_reflecting_code) (rtx *op))
{
  gcc_assert (!CONST_INT_P (op0));
  gcc_assert (CONST_INT_P (op3));

  const machine_mode result_mode = GET_MODE (op0);

  /* Reflected copy of the initial CRC.  */
  rtx running_crc = gen_reg_rtx (result_mode);
  convert_move (running_crc, op1, 0);
  gen_reflecting_code (&running_crc);

  /* Reflected copy of the data.  */
  rtx reflected_data = gen_reg_rtx (data_mode);
  convert_move (reflected_data, op2, 0);
  gen_reflecting_code (&reflected_data);

  /* Bit-forward CRC over the reflected inputs ...  */
  calculate_table_based_CRC (&running_crc, reflected_data, op3, result_mode,
                             data_mode);

  /* ... reflected back to give the bit-reversed result.  */
  gen_reflecting_code (&running_crc);
  convert_move (op0, running_crc, 0);
}

View file

@ -377,4 +377,10 @@ extern rtx expr_size (tree);
extern bool mem_ref_refers_to_non_mem_p (tree);
extern bool non_mem_decl_p (tree);
/* Generate table-based CRC.  */

/* Emit shift/and/or code that reflects (reverses the bits of) the rtx
   pointed to by the argument.  */
extern void generate_reflecting_code_standard (rtx *);
/* Expand a bit-forward table-based CRC.  The rtx arguments are, in order,
   the destination, the initial CRC, the data and the polynomial; the
   machine_mode is the mode of the data.  */
extern void expand_crc_table_based (rtx, rtx, rtx, rtx, machine_mode);
/* Likewise for a bit-reversed CRC.  The final argument is the callback
   used to emit the reflecting code.  */
extern void expand_reversed_crc_table_based (rtx, rtx, rtx, rtx, machine_mode,
void (*) (rtx *));
#endif /* GCC_EXPR_H */

View file

@ -191,6 +191,7 @@ init_internal_fns ()
#define mask_fold_left_direct { 1, 1, false }
#define mask_len_fold_left_direct { 1, 1, false }
#define check_ptrs_direct { 0, 0, false }
#define crc_direct { 1, -1, true }
const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
#define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@ -4054,6 +4055,79 @@ expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab,
expand_fn_using_insn (stmt, icode, 1, nargs);
}
/* Expand CRC call STMT for internal function FN (IFN_CRC or IFN_CRC_REV).
   The call's arguments are the initial CRC, the data and the polynomial,
   which must be an INTEGER_CST.  If the target supports FN for the
   (data type, result type) pair, the instruction found through OPTAB is
   used; otherwise a table-based sequence is generated.  */

static void
expand_crc_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab)
{
  tree lhs = gimple_call_lhs (stmt);
  tree crc_arg = gimple_call_arg (stmt, 0);
  tree data_arg = gimple_call_arg (stmt, 1);
  tree poly_arg = gimple_call_arg (stmt, 2);

  tree result_type = TREE_TYPE (lhs);
  tree data_type = TREE_TYPE (data_arg);
  const machine_mode result_mode = TYPE_MODE (result_type);
  const machine_mode data_mode = TYPE_MODE (data_type);

  gcc_assert (result_mode >= data_mode);

  rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
  rtx crc = expand_normal (crc_arg);
  rtx data = expand_normal (data_arg);
  gcc_assert (TREE_CODE (poly_arg) == INTEGER_CST);
  rtx polynomial = gen_rtx_CONST_INT (result_mode,
                                      TREE_INT_CST_LOW (poly_arg));

  /* Use target specific expansion if it exists.  */
  if (direct_internal_fn_supported_p (fn, tree_pair (data_type, result_type),
                                      OPTIMIZE_FOR_SPEED))
    {
      class expand_operand ops[4];
      create_call_lhs_operand (&ops[0], dest, result_mode);
      create_input_operand (&ops[1], crc, result_mode);
      create_input_operand (&ops[2], data, data_mode);
      create_input_operand (&ops[3], polynomial, result_mode);
      insn_code icode = convert_optab_handler (optab, data_mode, result_mode);
      expand_insn (icode, 4, ops);
      assign_call_lhs (lhs, dest, &ops[0]);
      return;
    }

  /* Otherwise, generate table-based CRC.

     We're bypassing all the operand conversions that are done in the
     case when we get an icode, operands and pass that off to expand_insn.
     That path has special case handling for promoted return values which
     we must emulate here (is the same kind of special treatment ever
     needed for input arguments here?).

     In particular we do not want to store directly into a promoted
     SUBREG destination, instead store into a suitably sized pseudo.  */
  rtx orig_dest = dest;
  if (SUBREG_P (dest) && SUBREG_PROMOTED_VAR_P (dest))
    dest = gen_reg_rtx (GET_MODE (dest));

  if (fn == IFN_CRC)
    /* IFN_CRC: bit-forward CRC.  */
    expand_crc_table_based (dest, crc, data, polynomial, data_mode);
  else
    /* IFN_CRC_REV: bit-reversed CRC.  */
    expand_reversed_crc_table_based (dest, crc, data, polynomial, data_mode,
                                     generate_reflecting_code_standard);

  /* Now get the return value where it needs to be, taking care to
     ensure it's promoted appropriately if the ABI demands it.
     Re-use assign_call_lhs to handle the details.  */
  class expand_operand lhs_op;
  create_call_lhs_operand (&lhs_op, dest, result_mode);
  lhs_op.value = dest;
  assign_call_lhs (lhs, orig_dest, &lhs_op);
}
/* Expanders for optabs that can use expand_direct_optab_fn. */
#define expand_unary_optab_fn(FN, STMT, OPTAB) \
@ -4190,6 +4264,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_cond_len_unary_optab_supported_p direct_optab_supported_p
#define direct_cond_len_binary_optab_supported_p direct_optab_supported_p
#define direct_cond_len_ternary_optab_supported_p direct_optab_supported_p
#define direct_crc_optab_supported_p convert_optab_supported_p
#define direct_mask_load_optab_supported_p convert_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p

View file

@ -202,6 +202,8 @@ along with GCC; see the file COPYING3. If not see
cond_len_##UNSIGNED_OPTAB, cond_len_##TYPE)
#endif
DEF_INTERNAL_OPTAB_FN (CRC, ECF_CONST | ECF_NOTHROW, crc, crc)
DEF_INTERNAL_OPTAB_FN (CRC_REV, ECF_CONST | ECF_NOTHROW, crc_rev, crc)
DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load)
DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,

View file

@ -85,6 +85,8 @@ OPTAB_CD(smsub_widen_optab, "msub$b$a4")
OPTAB_CD(umsub_widen_optab, "umsub$b$a4")
OPTAB_CD(ssmsub_widen_optab, "ssmsub$b$a4")
OPTAB_CD(usmsub_widen_optab, "usmsub$a$b4")
OPTAB_CD(crc_optab, "crc$a$b4")
OPTAB_CD(crc_rev_optab, "crc_rev$a$b4")
OPTAB_CD(vec_load_lanes_optab, "vec_load_lanes$a$b")
OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b")
OPTAB_CD(vec_mask_load_lanes_optab, "vec_mask_load_lanes$a$b")