nasm/editors/nasmtok.pl

#!/usr/bin/perl
#
# Automatically produce some tables useful for a NASM major mode
#

use integer;
use strict;
use File::Spec;
use File::Find;

my $format = 'el';

if ($ARGV[0] =~ /^-(\S+)$/) {
    $format = $1;
    shift @ARGV;
}

my($outfile, $srcdir, $objdir) = @ARGV;

if (!defined($outfile)) {
    die "Usage: $0 [-format] outfile srcdir objdir\n";
}

my @vpath;

$srcdir = $srcdir || File::Spec->curdir();
$objdir = $objdir || $srcdir;
push(@vpath, $objdir) if ($objdir ne $srcdir);
push(@vpath, $srcdir);

my %tokens = ();		# Token lists per category
my %token_category = ();	# Tokens to category map

sub xpush($@) {
    my $ref = shift @_;

    $$ref = [] unless (defined($$ref));
    return push(@$$ref, @_);
}

# Search for a file, and return a file handle if successfully opened
sub open_vpath($$) {
    my($mode, $file) = @_;
    my %tried;

    # For simplicity, allow filenames to be specified
    # with Unix / syntax internally
    $file = File::Spec->catfile(split(/\//, $file));

    foreach my $d (@vpath) {
	my $fn = File::Spec->catfile($d, $file);
	next if ($tried{$fn});
	$tried{$fn}++;
	my $fh;
	return $fh if (open($fh, $mode, $fn));
    }
    return undef;
}

sub must_open($) {
    my($file) = @_;
    my $fh = open_vpath('<', $file);
    return $fh if (defined($fh));
    die "$0:$file: $!\n";
}

# Combine some specific token types
my %override = (
    'brcconst' => 'special-constant',
    'id' => 'special',
    'float' => 'function',
    'floatize' => 'function',
    'strfunc' => 'function',
    'ifunc' => 'function',
    'insn' => 'instruction',
    'reg' => 'register',
    'seg' => 'special',
    'wrt' => 'special',
    'times' => 'special');

sub addtoken($@) {
    my $type = shift @_;

    foreach my $token (@_) {
	unless (defined($token_category{$token})) {
	    $type = $override{$type} if (defined($override{$type}));
	    xpush(\$tokens{$type}, $token);
	    $token_category{$token} = $type;
	}
    }
}

sub read_tokhash_c($) {
    my($tokhash_c) = @_;

    my $th = must_open($tokhash_c);

    my $l;
    my $tokendata = 0;
    while (defined($l = <$th>)) {
	if ($l =~ /\bstruct tokendata tokendata\[/) {
	    $tokendata = 1;
	    next;
	} elsif (!$tokendata) {
	    next;
	}

	last if ($l =~ /\}\;/);

	if ($l =~ /^\s*\{\s*\"(.*?)\",.*?,\s*TOKEN_(\w+),(.*)\}/) {
	    my $token = $1;
	    my $type  = lc($2);
	    my $flags = $3;

	    $token = "{${token}}" if ($flags =~ /\bTFLAG_BRC\b/);

	    # Parametric token: omit the actual parameter(s)
	    $token =~ s/^(\{[\w-]+=).+(\})$/$1$2/;

	    if ($token !~ /^(\{[\w-]+=?\}|\w+)$/) {
		$type = 'operator';
	    } elsif ($token =~ /^__\?masm_.*\?__$/) {
		next;
	    }
	    addtoken($type, $token);
	    if ($token =~ /^__\?(.*)\?__$/) {
		# Also encode the "user" (macro) form without __?...?__
		addtoken($type, $1);
	    }
	}
    }
    close($th);
}

sub read_pptok_c($) {
    my($pptok_c) = @_;

    my $pt = must_open($pptok_c);

    my $l;
    my $pp_dir = 0;

    while (defined($l = <$pt>)) {
	if ($l =~ /\bpp_directives\[/) {
	    $pp_dir = 1;
	    next;
	} elsif (!$pp_dir) {
	    next;
	}

	last if ($l =~ /\}\;/);

	if ($l =~ /^\s*\"(.*?)\"/) {
	    addtoken('pp-directive', $1);
	}
    }
    close($pt);
}

sub read_directiv_dat($) {
    my($directiv_dat) = @_;

    my $dd = must_open($directiv_dat);

    my $l;
    my $directiv = 0;

    while (defined($l = <$dd>)) {
	if ($l =~ /^\; ---.*?(pragma)?/) {
	    $directiv = ($1 ne 'pragma');
	    next;
	} elsif (!$directiv) {
	    next;
	}

	if ($l =~ /^\s*(\w+)/) {
	    addtoken('directive', $1);
	}
    }

    close($dd);
}

my %version;
sub read_version($) {
    my($vfile) = @_;
    my $v = must_open($vfile);

    while (defined(my $vl = <$v>)) {
	if ($vl =~ /^NASM_(\w+)=(\S+)\s*$/) {
	    $version{lc($1)} = $2;
	}
    }
    close($v);
}

# This is called from the directory search in read_macros(), so
# don't use must_open() here.
sub read_macro_file($) {
    my($file) = @_;

    open(my $fh, '<', $file) or die "$0:$file: $!\n";
    while (defined(my $l = <$fh>)) {
	next unless ($l =~ /^\s*\%/);
	my @f = split(/\s+/, $l);
	next unless (scalar(@f) >= 2);
	next if ($f[1] =~ /^[\%\$][^\(]+$/); # Internal use only
	$f[1] =~ s/\(.*$//;	# Strip argument list if any
	$f[1] = lc($f[1]) if ($f[0] =~ /^\%i/);
	if ($f[0] =~ /^\%(i)?(assign|defalias|define|defstr|substr|xdefine)\b/) {
	    addtoken('smacro', $f[1]);
	} elsif ($f[0] =~ /^\%i?macro$/) {
	    addtoken('mmacro', $f[1]);
	}
    }
    close($fh);
}

sub read_macros(@) {
    my %visited;
    my @dirs = (File::Spec->curdir(), qw(macros output editors));
    @dirs = map { my $od = $_; map { File::Spec->catdir($od, $_) } @dirs } @_;
    foreach my $dir (@dirs) {
	next if ($visited{$dir});
	$visited{$dir}++;
	next unless opendir(my $dh, $dir);
	while (defined(my $fn = readdir($dh))) {
	    next unless ($fn =~ /\.mac$/i);
	    read_macro_file(File::Spec->catfile($dir, $fn));
	}
	closedir($dh);
    }
}

# Handle special tokens which may not have been picked up by the automatic
# process, because they depend on the build parameters, or are buried
# deep in C code...
sub add_special_cases() {
    # Not defined in non-snapshot builds
    addtoken('smacro', '__NASM_SNAPSHOT__', '__?NASM_SNAPSHOT?__');
}

sub make_lines($$@) {
    my $maxline = shift @_;
    my $indent  = shift @_;

    # The first line isn't explicitly indented and the last line
    # doesn't end in "\n"; assumed the surrounding formatter wants
    # do control that
    my $linepos   = 0;
    my $linewidth = $maxline - $indent;

    my $line = '';
    my @lines = ();

    foreach my $w (@_) {
	my $l = length($w);

	if ($linepos > 0 && $linepos+$l+1 >= $linewidth) {
	    $line .= "\n" . (' ' x $indent);
	    push(@lines, $line);
	    $linepos = 0;
	    $line = '';
	}
	if ($linepos > 0) {
	    $line .= ' ';
	    $linepos++;
	}
	$line .= $w;
	$linepos += $l;
    }

    if ($linepos > 0) {
	push(@lines, $line);
    }

    return @lines;
}

sub quote_for_emacs(@) {
    return map { s/[\\\"\']/\\$1/g; '"'.$_.'"' } @_;
}

# Emacs LISP
sub write_output_el {
    my($out, $outfile, $file) = @_;
    my $whoami = 'NASM '.$version{'ver'};

    print $out ";;; ${file} --- lists of NASM assembler tokens\n\n";
    print $out ";;; Commentary:\n\n";
    print $out ";; This file contains list of tokens from the NASM x86\n";
    print $out ";; assembler, automatically extracted from ${whoami}.\n";
    print $out ";;\n";
    print $out ";; This file is intended to be (require)d from a `nasm-mode\'\n";
    print $out ";; major mode definition.\n";
    print $out ";;\n";
    print $out ";; Tokens that are only recognized inside curly braces are\n";
    print $out ";; noted as such. Tokens of the form {xxx=} are parametric\n";
    print $out ";; tokens, where the token may contain additional text on\n";
    print $out ";; the right side of the = sign. For example,\n";
    print $out ";; {dfv=} should be matched by {dfv=cf,zf}.\n";
    print $out "\n";
    print $out ";;; Code:\n";

    my @types = sort keys(%tokens);

    # Write the individual token type lists
    foreach my $type (sort keys(%tokens)) {
	print $out "\n(defconst nasm-${type}\n";
	print $out "  \'(";

	print $out make_lines(78, 4, quote_for_emacs(sort @{$tokens{$type}}));
	print $out ")\n";
	print $out "  \"${whoami} ${type} tokens for `nasm-mode\'.\")\n";
    }

    # Generate a list of all the token type lists.
    print $out "\n(defconst nasm-token-lists\n";
    print $out "  \'(";
    print $out make_lines(78, 4, map { "'nasm-$_" } sort keys(%tokens));
    print $out ")\n";
    print $out "  \"List of all ${whoami} token type lists.\")\n";

    # The NASM token extracted version
    printf $out "\n(defconst nasm-token-version %s\n",
	quote_for_emacs($version{'ver'});
    print $out "  \"Version of NASM from which tokens were extracted,\n";
    print $out "as a human-readable string.\")\n";

    printf $out "\n(defconst nasm-token-version-id #x%08x\n",
	$version{'version_id'};
    print $out "  \"Version of NASM from which tokens were extracted,\n";
    print $out "as numeric identifier, for comparisons. Equivalent to the\n";
    print $out "__?NASM_VERSION_ID?__ NASM macro value.\")\n";

    printf $out "\n(defconst nasm-token-version-snapshot %s\n",
	$version{'snapshot'} || 'nil';
    print $out "  \"Daily NASM snapshot build from which tokens were extracted,\n";
    print $out "as a decimal number in YYYYMMDD format, or nil if not a\n";
    print $out "daily snapshot build.\")\n";

    # Footer
    print $out "\n(provide 'nasmtok)\n";
    print $out ";;; ${file} ends here\n";

    return 0;
}

# JSON
sub write_output_json {
    use JSON;

    my($out, $outfile, $file) = @_;
    my $whoami = 'NASM '.$version{'ver'};


    my $json = JSON->new;
    $json = $json->ascii(1)->canonical(1);

    my %ver;
    foreach my $vn (keys(%version)) {
	my $vv = $version{$vn};
	next if ($vn eq 'version_xid');
	$vn =~ s/_ver$//;
	$vn =~ s/^version_//;
	$vv = $vv + 0 if ($vn ne 'ver');
	$ver{$vn} = $vv;
    }

    print $out $json->encode({
	'$comment' => "NASM syntax information extracted from ${whoami}",
	    'tokens' => \%tokens, 'version' => \%ver});
    print $out "\n";
    return 0;
}

sub write_output($$) {
    my($format, $outfile) = @_;
    my %formats = (
	'el' => \&write_output_el,
	'json' => \&write_output_json
    );

    my $outfunc = $formats{$format};
    if (!defined($outfunc)) {
	die "$0: unknown output format: $format\n";
    }

    open(my $out, '>', $outfile)
	or die "$0:$outfile: $!\n";

    my($vol,$dir,$file) = File::Spec->splitpath($outfile);

    my $err = $outfunc->($out, $outfile, $file);
    close($out);

    if ($err) {
	unlink($outfile);
	die "$0:$outfile: error writing output\n";
    }
}

add_special_cases();
read_tokhash_c('asm/tokhash.c');
read_pptok_c('asm/pptok.c');
read_directiv_dat('asm/directiv.dat');
read_version('version.mak');
read_macros(@vpath);
write_output($format, $outfile);