insns.pl: sanity-check that instruction encodings match operands

Error out if an encoding position is invalid for its operand, such as an "r" encoding slot assigned to an "xmmrm" operand.

Document the instruction encoding symbols; there are too many of them by now.

Add symbols 'n' and 'w', meaning immediates that are supposed to be encoded as if they were 'm' memory addresses and 'v' register numbers, respectively; this is necessary to indicate a validation exception.

Remove the broken ARPL "memory-like" encoding. It probably never worked anyway.

This verification caught two bugs already:

- VPMASKMOV[DQ] cannot omit the second operand.
- Incorrect operand encoding order for VREDUCESH.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
2025-10-10 00:25:06 -04:00 · 2025-09-30 10:48:13 -07:00
parent 54547eba35
commit 48f7170772
3 changed files with 133 additions and 43 deletions
--- a/x86/bytecode.txt
+++ b/x86/bytecode.txt
@@ -7,8 +7,22 @@ and consumed by asm/assemble.c and disasm/disasm.c.
 Values prefixed with \ are in octal, values prefixed with \x are in
 hexadecimal.

-The mnemonics are the ones used in x86/insns.txt, where applicable.
+The mnemonics are the ones used in x86/insns.dat, where applicable.

+In x86/insns.dat, the encoding slot of each operand is encoded as:
+
+	-	implicit operand (no encoding)
+	x+y	multiple encoding slots for one operand
+	r	"r" position in modr/m, or base register with "+r"
+	m	"m" position in modr/m
+	n	immediate encoded in the "m" position in modr/m
+	b	register encoded in the "m" position in modr/m
+	x	register encoded in the "x" position in modr/m + sib (MIB)
+	v	"v" register position in vex/evex
+	s	"s" register position in /is4
+	w	immediate encoded in the "v" position in vex/evex
+	i	first immediate or mem_offs
+	j	second immediate or mem_offs

 Codes            Mnemonic        Explanation

--- a/x86/insns.dat
+++ b/x86/insns.dat
@@ -96,7 +96,7 @@ $bwdq MOVRS	reg#,mem#			[rm:	evex.nf0.nd0.l0.m4.o# 8a# /r]		FUTURE,SM

 ;# Load effective address
 $wdq  LEA	reg#,mem			[rm:	o# 8d /r]				8086
-$wdq  LEA	reg#,imm#			[rm:	o# 8d /r]				8086,ND
+$wdq  LEA	reg#,imm#			[rn:	o# 8d /r]				8086,ND

 ;# The basic 8 arithmetic operations
 $arith		nf=nf ADD OR nf=,ADC nf=,SBB AND SUB XOR nf=,!evex,CMP
@@ -448,7 +448,7 @@ WBNOINVD	void				[	f3 0f 09]				WBNOINVD,PRIV

 INVPCID		reg32,mem128			[rm:	66 0f38 82 /r]				INVPCID,PRIV,NOLONG
 INVPCID		reg64,mem128			[rm:	66 0f38 82 /r]				INVPCID,PRIV,LONG
-INVPCID		reg64,mem128			[rm: 	evex.nf0.nd0.l0.f3.m4.w1 f2 /r]		APX,INVPCID,PRIV,LONG
+INVPCID		reg64,mem128			[rm:	evex.nf0.nd0.l0.f3.m4.w1 f2 /r]		APX,INVPCID,PRIV,LONG
 INVLPG		mem				[m:	0f 01 /7]				486,PRIV
 $wdq  INVLPGA	ax#,reg_ecx			[--:	a#  0f 01 df]				X86_64,AMD
      INVLPGA	void				[	adf 0f 01 df]				X86_64,AMD,ND
@@ -578,7 +578,7 @@ $dq   RDGSBASE	reg#				[m:	w# f3 0f ae /1]				LONG
 $dq   WRFSBASE	reg#				[m:	w# f3 0f ae /2]				LONG
 $dq   WRGSBASE	reg#				[m:	w# f3 0f ae /3]				LONG

-$zwd  ARPL	rm16,sel#			[mr:	optw# 63 /r]				286,PROT,SM,NOLONG
+$wd   ARPL	rm16,reg#			[mr:	optw# 63 /r]				286,PROT,SM,NOLONG
 $wdq  LAR	reg#,rm_sel			[rm:	optd# 0f 02 /r]				286,PROT
 $wdq  LSL	reg#,rm_sel			[rm:	optd# 0f 03 /r]				286,PROT

@@ -984,14 +984,14 @@ FWAIT		void				[	wait]					8086
 XLATB		void				[	d7]					8086
 XLAT		void				[	d7]					8086,ND

-$bwdq CCMPscc	spec4,rm#,reg#			[vmr:	evex.scc.dfv.l0.m4.o# 38# /r	]	APX,SM1-2
-$bwdq CCMPscc	spec4,reg#,rm#			[vrm:   evex.scc.dfv.l0.m4.o# 3a# /r	]	APX,SM1-2
-$wdq  CCMPscc	spec4,rm#,sbyte#		[vmi:	evex.scc.dfv.l0.m4.o# 83  /7	ib,s ]	APX,SM1-2
-$bwdq CCMPscc	spec4,rm#,imm#			[vmi:	evex.scc.dfv.l0.m4.o# 80# /7	i#   ]	APX,SM1-2
+$bwdq CCMPscc	spec4,rm#,reg#			[wmr:	evex.scc.dfv.l0.m4.o# 38# /r	]	APX,SM1-2
+$bwdq CCMPscc	spec4,reg#,rm#			[wrm:   evex.scc.dfv.l0.m4.o# 3a# /r	]	APX,SM1-2
+$wdq  CCMPscc	spec4,rm#,sbyte#		[wmi:	evex.scc.dfv.l0.m4.o# 83  /7	ib,s ]	APX,SM1-2
+$bwdq CCMPscc	spec4,rm#,imm#			[wmi:	evex.scc.dfv.l0.m4.o# 80# /7	i#   ]	APX,SM1-2

-$bwdq CTESTscc	spec4,rm#,reg#			[vmr:	evex.scc.dfv.l0.m4.o# 84# /r	]	APX,SM1-2
-$bwdq CTESTscc	spec4,rm#,imm#			[vmi:	evex.scc.dfv.l0.m4.o# f6# /0	i#   ]	APX,SM1-2
-$bwdq CTESTscc	spec4,rm#,imm#			[vmi:	evex.scc.dfv.l0.m4.o# f6# /1	i#   ]	APX,SM1-2
+$bwdq CTESTscc	spec4,rm#,reg#			[wmr:	evex.scc.dfv.l0.m4.o# 84# /r	]	APX,SM1-2
+$bwdq CTESTscc	spec4,rm#,imm#			[wmi:	evex.scc.dfv.l0.m4.o# f6# /0	i#   ]	APX,SM1-2
+$bwdq CTESTscc	spec4,rm#,imm#			[wmi:	evex.scc.dfv.l0.m4.o# f6# /1	i#   ]	APX,SM1-2

 ;# Conditional instructions
 $wdq  CMOVcc	reg#,rm#			[rm:	o#  0f 40+c /r]				P6,SM
@@ -2957,10 +2957,10 @@ VPMASKMOVD	ymmreg,ymmreg*,mem256		[rvm:	vex.nds.256.66.0f38.w0 8c /r]		AVX2
 VPMASKMOVQ	xmmreg,xmmreg*,mem128		[rvm:	vex.nds.128.66.0f38.w1 8c /r]		AVX2
 VPMASKMOVQ	ymmreg,ymmreg*,mem256		[rvm:	vex.nds.256.66.0f38.w1 8c /r]		AVX2

-VPMASKMOVD	mem128,xmmreg*,xmmreg		[mvr:	vex.nds.128.66.0f38.w0 8e /r]		AVX2
-VPMASKMOVD	mem256,ymmreg*,ymmreg		[mvr:	vex.nds.256.66.0f38.w0 8e /r]		AVX2
-VPMASKMOVQ	mem128,xmmreg*,xmmreg		[mvr:	vex.nds.128.66.0f38.w1 8e /r]		AVX2
-VPMASKMOVQ	mem256,ymmreg*,ymmreg		[mvr:	vex.nds.256.66.0f38.w1 8e /r]		AVX2
+VPMASKMOVD	mem128,xmmreg,xmmreg		[mvr:	vex.nds.128.66.0f38.w0 8e /r]		AVX2
+VPMASKMOVD	mem256,ymmreg,ymmreg		[mvr:	vex.nds.256.66.0f38.w0 8e /r]		AVX2
+VPMASKMOVQ	mem128,xmmreg,xmmreg		[mvr:	vex.nds.128.66.0f38.w1 8e /r]		AVX2
+VPMASKMOVQ	mem256,ymmreg,ymmreg		[mvr:	vex.nds.256.66.0f38.w1 8e /r]		AVX2

 VPSLLVD		xmmreg,xmmreg*,xmmrm128		[rvm:	vex.nds.128.66.0f38.w0 47 /r]		AVX2
 VPSLLVQ		xmmreg,xmmreg*,xmmrm128		[rvm:	vex.nds.128.66.0f38.w1 47 /r]		AVX2
@@ -5749,7 +5749,7 @@ VRCPSH		xmmreg|mask|z,xmmreg*,xmmrm16|sae	[rvm:t1s: evex.nds.lig.66.map6.w0 4d /
 VREDUCEPH	xmmreg|mask|z,xmmrm128|b16,imm8		[rmi:fv: evex.128.np.0f3a.w0 56 /r ib]	AVX512FP16,AVX512VL
 VREDUCEPH	ymmreg|mask|z,ymmrm256|b16,imm8		[rmi:fv: evex.256.np.0f3a.w0 56 /r ib]	AVX512FP16,AVX512VL
 VREDUCEPH	zmmreg|mask|z,zmmrm512|b16|sae,imm8	[rmi:fv: evex.512.np.0f3a.w0 56 /r ib]	AVX512FP16
-VREDUCESH	xmmreg|mask|z,xmmreg*,xmmrm16|sae,imm8	[rmvi:t1s: evex.nds.lig.np.0f3a.w0 57 /r ib]	AVX512FP16
+VREDUCESH	xmmreg|mask|z,xmmreg*,xmmrm16|sae,imm8	[rvmi:t1s: evex.nds.lig.np.0f3a.w0 57 /r ib]	AVX512FP16
 VENDSCALEPH	xmmreg|mask|z,xmmrm128|b16,imm8		[rmi:fv: evex.128.np.0f3a.w0 08 /r ib]	AVX512FP16,AVX512VL
 VENDSCALEPH	ymmreg|mask|z,ymmrm256|b16,imm8		[rmi:fv: evex.256.np.0f3a.w0 08 /r ib]	AVX512FP16,AVX512VL
 VENDSCALEPH	zmmreg|mask|z,zmmrm512|b16|sae,imm8	[rmi:fv: evex.512.np.0f3a.w0 08 /r ib]	AVX512FP16
--- a/x86/insns.pl
+++ b/x86/insns.pl
@@ -118,7 +118,7 @@ sub startseq($$) {
    my $enc = 0;		# Legacy
    my $map = 0;		# Map 0

-    @codes = decodify(undef, $codestr, {});
+    @codes = decodify(undef, $codestr, {}, undef);

    while (defined($c0 = shift(@codes))) {
        $c1 = $codes[0];	# The immediate following code
@@ -740,7 +740,7 @@ sub format_insn($$$$) {
    my ($num, $flagsindex);
    my @bytecode;
    my ($op, @ops, @opsize, $opp, @opx, @oppx, @decos, @opevex);
-    my %oppos;
+    my $opinfo;

    return (undef, undef) if $operands eq 'ignore';

@@ -751,7 +751,8 @@ sub format_insn($$$$) {
    set_implied_flags(\%flags);

    # Generate byte code. This may modify the flags.
-    @bytecode = (decodify($opcode, $codes, \%flags, \%oppos), 0);
+    @bytecode = (decodify($opcode, $codes, \%flags, \$opinfo), 0);
+    my($oppos, $openc) = @$opinfo;
    push(@bytecode_list, [@bytecode]);
    $codes = hexstr(@bytecode);
    count_bytecodes(@bytecode);
@@ -766,8 +767,13 @@ sub format_insn($$$$) {
    @opsize = ();
    @decos = ();
    if ($operands ne 'void') {
-	my $opnum = scalar(@ops);
        foreach $op (split(/,/, $operands)) {
+	    my $opnum = scalar(@ops);
+	    my $isreg = 0;
+	    my $ismem = 0;
+	    my $ismoffs = 0;
+	    my $isimm = 0;
+	    my $isrm  = 0;
 	    my $iszero = 0;
 	    my $opsz = 0;
            @opx = ();
@@ -778,6 +784,8 @@ sub format_insn($$$$) {
                    push(@opevex, $1);
                }

+		$opp =~ s/^reg([0-9]*)na$/reg_na$1/;
+
                if ($opp =~ s/([^0-9]0?)(8|16|32|64|80|128|256|512|1024|1k)$/$1/) {
                    push(@oppx, "bits$2");
 		    $opsz = $1 + 0;
@@ -789,35 +797,68 @@ sub format_insn($$$$) {
 			$opp .= 'reg';
 		    }
 		}
-                $opp =~ s/^mem$/memory/;
+
                $opp =~ s/^memory_offs$/mem_offs/;
+		$opp =~ s/^mem$/memory/;
+
 		if ($opp =~ s/^(spec|imm)4$/$1/) {
 		    push(@oppx, 'fourbits');
+		    $isimm = 1;
 		}
-		$opp =~ s/^spec$/immediate/; # Immediate or special immediate
-                $opp =~ s/^imm$/imm_normal/; # Normal immediates only
+		$opp =~ s/^spec$/immediate/; # Special or normal immediate
+		$opp =~ s/^imm$/imm_normal/; # Normal immediate only
 		if ($opp =~ /^(unity|sbyted?word|[su]dword)$/) {
 		    push(@oppx, 'imm_normal');
+		    $isimm = 1;
+		}
+		if ($opp =~ /^imm/) {
+		    $isimm = 1;
 		}
                $opp =~ s/^([a-z]+)rm$/rm_$1/;
                $opp =~ s/^(rm|reg)$/$1_gpr/;
 		$opp =~ s/^rm_k$/rm_opmask/;
 		$opp =~ s/^kreg$/opmaskreg/;
-		my $isreg = ($opp =~ /(\brm_|\breg_|reg\b)/);
-		my $isrm  = $isreg || ($opp =~ /\bmem/);
-		my $isvec = ($opp =~ /\b[xyzt]mm/);
-		if ($isrm &&
+		if ($opp =~ /\brm_/) {
+		    $isrm = 1;
+		} elsif ($opp =~ /(\breg_|reg\b)/) {
+		    $isreg = 1;
+		} elsif ($opp =~ /\b[xyzt]?mem/) {
+		    $ismem = 1;
+		}
+		if ($opp =~ /\bmem_offs/) {
+		    $ismoffs = 1;
+		}
+		if ($opp =~ /\b[xyzt]mm/) {
+		    $isvec = 1;
+		}
+		if (($isrm || ($ismem && !$ismoffs) || $isreg) &&
 		    !(($flags{'EVEX'} && $isvec) || !$flags{'NOAPX'})) {
 		    # Register numbers >= 16 disallowed
 		    push(@oppx, 'rn_l16');
 		}
-		if ($isreg && $isvec &&
-		    defined($oppos->{'b'}) && $opnum == $oppos->{'b'}) {
+		if ($isreg && $isvec && $openc->[$opnum] =~ /b/) {
 		    $flags{'MOPVEC'}++;
 		}
                push(@opx, $opp, @oppx) if $opp;
            }
-            $op = join('|', @opx);
+
+	    # Sanity-check the encoding of this operand
+	    my $opvalid = '-';
+	    if ($isreg) {
+		$opvalid .= 'rvmsbx';
+	    } elsif ($isimm || $ismoffs) {
+		$opvalid .= 'ijnw';
+	    } elsif ($ismem || $isrm) {
+		$opvalid .= 'm';
+	    }
+
+	    foreach my $c (split(//, $openc->[$opnum])) {
+		if (index($opvalid, $c) < 0) {
+		    die "$fname:$line: $opcode: operand $opnum \"$op\": '$c' must be one of '$opvalid'\n";
+		}
+	    }
+
+            $op = join('|',@opx);
            push(@ops, $op);
 	    push(@opsize, $opsz);
            push(@decos, (@opevex ? join('|', @opevex) : '0'));
@@ -954,17 +995,17 @@ sub show_iflags($) {
 #
 # Turn a code string into a sequence of bytes
 #
-sub decodify($$$) {
+sub decodify($$$$) {
  # Although these are C-syntax strings, by convention they should have
  # only octal escapes (for directives) and hexadecimal escapes
  # (for verbatim bytes)
-    my($opcode, $codestr, $flags) = @_;
+    my($opcode, $codestr, $flags, $opinfo) = @_;
    my @codes;

    if ($codestr eq 'ignore') {
 	@codes = ();
    } elsif ($codestr =~ /^\s*\[([^\]]*)\]\s*$/) {
-        @codes = byte_code_compile($opcode, $1, $flags);
+        @codes = byte_code_compile($opcode, $1, $flags, $opinfo);
    } else {
 	# This really shouldn't happen anymore...
 	warn "$fname:$line: raw bytecodes?!\n";
@@ -1056,7 +1097,7 @@ sub tupletype($) {
 # enter it as e.g. "r+v".
 #
 sub byte_code_compile($$$$) {
-    my($opcode, $str, $flags, $oppos) = @_;
+    my($opcode, $str, $flags, $opinfo) = @_;
    my $opr;
    my $opc;
    my @codes = ();
@@ -1158,14 +1199,49 @@ sub byte_code_compile($$$$) {
    $opc = lc($4);

    $op = 0;
-    $oppos = {};
+    my $oppos = {};
+    my $openc = [];
+    if (defined($opinfo)) {
+	$$opinfo = [$oppos, $openc];
+    }
    for ($i = 0; $i < length($opr); $i++) {
        my $c = substr($opr,$i,1);
        if ($c eq '+') {
+	    die "$fname:$line: $opcode: invalid use of '+' in '$opr'\n"
+		if ($op < 1);
            $op--;
+        } elsif ($c =~ /^[rmnvwsijbx-]$/) {
+	    # n means an immediate which is encoded as a memory address,
+	    # but unlike a mem_offs it supports rel encoding on 64 bits.
+	    # w means an immediate to be encoded into the v register
+	    # position.
+	    (my $realc = $c) =~ tr/nw/mv/;
+	    $openc->[$op] = '' unless (defined($openc->[$op]));
+	    $openc->[$op] .= $c;
+	    if (defined($oppos->{$realc})) {
+		my $what = ($c eq $realc) ? "'$c'" : "[${realc}${c}]";
+		die "$fname:$line: $opcode: More than one $what operand in '$opr'\n";
+	    }
+	    $oppos->{$realc} = $op unless ($realc eq '-');
+	    $op++;
        } else {
-            $oppos->{$c} = $op++;
-        }
+	    die "$fname:$line: $opcode: Unknown operand encoding '$c'\n";
+	}
+    }
+
+    if (defined($oppos->{'m'})) {
+	if (defined($oppos->{'b'})) {
+	    die "$fname:$line: $opcode: [mn] operand mutually exclusive with 'b'\n";
+	} elsif (defined($oppos->{'x'})) {
+	    # memory operand + x register operand requires MIB
+	    $flags->{'MIB'}++;
+	}
+    }
+    if (defined($oppos->{'s'}) && defined($oppos->{'i'})) {
+	die "$fname:$line: $opcode: 's' operand mutually exclusive with 'i'\n";
+    }
+    if (defined($oppos->{'j'}) && !defined($oppos->{'i'})) {
+	die "$fname:$line: $opcode 'j' without 'i' operand\n";
    }
    $tup = tupletype($tuple);

@@ -1223,7 +1299,7 @@ sub byte_code_compile($$$$) {
            $prefix_ok = 0;
        } elsif ($op eq '/r') {
            if (!defined($oppos->{'r'}) || !defined($oppos->{'m'})) {
-                die "$fname:$line: $opcode: $op requires r and m operands\n";
+                die "$fname:$line: $opcode: $op requires 'r' and [mn] operands\n";
            }
            $opex = (($oppos->{'m'} & 4) ? 06 : 0) |
                (($oppos->{'r'} & 4) ? 05 : 0);
@@ -1234,14 +1310,14 @@ sub byte_code_compile($$$$) {
            $prefix_ok = 0;
        } elsif ($op =~ m:^/([0-7])$:) {
            if (!defined($oppos->{'m'})) {
-                die "$fname:$line: $opcode: $op requires an m operand\n";
+                die "$fname:$line: $opcode: $op requires an [mn] operand\n";
            }
            push(@codes, 06) if ($oppos->{'m'} & 4);
            push(@codes, 0200 + (($oppos->{'m'} & 3) << 3) + $1);
            $prefix_ok = 0;
 	} elsif ($op =~ m:^/([0-3]?)r([0-7])$:) {
 	    if (!defined($oppos->{'r'})) {
-                die "$fname:$line: $opcode: $op requires an r operand\n";
+                die "$fname:$line: $opcode: $op requires an 'r' operand\n";
 	    }
 	    push(@codes, 05) if ($oppos->{'r'} & 4);
 	    push(@codes, 0171);
@@ -1332,7 +1408,7 @@ sub byte_code_compile($$$$) {
 		    $m = $2+0;
 		} elsif ($oq eq 'nds' || $oq eq 'ndd' || $oq eq 'dds') {
 		    if (!defined($oppos->{'v'})) {
-			die "$fname:$line: $opcode: $vexname.$oq without 'v' operand\n";
+			die "$fname:$line: $opcode: $vexname.$oq without [vw] operand\n";
 		    }
 		    $has_nds = 1;
 		} else {
@@ -1476,7 +1552,7 @@ sub byte_code_compile($$$$) {
 		    $flags->{'ZU_E'}++;
 		} elsif ($oq =~ /^(nds|ndd|nd|dds)$/) {
 		    if (!defined($oppos->{'v'})) {
-			die "$fname:$line: $opcode: evex.$oq without 'v' operand\n";
+			die "$fname:$line: $opcode: evex.$oq without [vw] operand\n";
 		    }
 		    $nds = 1;
 		    $nd  = $oq eq 'nd';
@@ -1544,7 +1620,7 @@ sub byte_code_compile($$$$) {
        } elsif (defined $imm_codes{$op}) {
            if ($op eq 'seg') {
                if ($last_imm lt 'i') {
-                    die "$fname:$line: $opcode: seg without an immediate operand\n";
+                    die "$fname:$line: $opcode: seg without an [ij] operand\n";
                }
            } else {
                $last_imm++;