#!/usr/bin/perl

# File: fa2re
# Author: Anand Natrajan
# Contact: anand@virginia.edu, http://www.cs.virginia.edu/~an4m/
# Task: Convert an FA (finite automaton) to an RE (regular expression).
#       Algorithm from Introduction to Automata Theory, Languages and
#       Computation by Hopcroft & Ullman, Chapter 2, p33-35.
# I/P and O/P: STDIN and STDOUT

# Global variables
@states = ();              # set of states of FA
@alphabet = ();            # alphabet accepted by FA
@transitions = ();         # transitions made by FA
$start_state = 1;          # start state of FA
@final_states = ();        # set of final states of FA
@allREs = ([[0]]);         # 3D array storing REs thus far computed

# Test if the candidate element belongs in the specified list
sub belongs
{
	$candidate = $_[0]; shift;
	foreach $element (@_) { if ($element eq $candidate) { return 1; } }
	return 0;
}

# Read in input file and validate FA
foreach $line (<ARGV>)
{
	chop($line); $line =~ s/#.*//;              # remove comments
	if ($line =~ /^\s*$/) { next; }             # discard blank lines
	elsif ($line =~ /^states/)                  # parse states
	{
		($tag, $info) = split(/:\s*/, $line);   # ignore before colon
		$info =~ tr/,;/ /;                      # remove separators
		# states must be numbered uniquely and sorted
		$info =~ /[^ 0-9]/ and die "Non-numeric state number\n";
		@states = sort {$a <=> $b} split(/\s+/, $info);
	}
	elsif ($line =~ /^alpha/)                   # parse alphabet
	{
		($tag, $info) = split(/:\s*/, $line);   # ignore before colon
		$info =~ tr/,;/ /;                      # remove separators
		# alphabet characters must be unique alphanumerics and sorted
		$info =~ /[^ a-zA-Z0-9%]/ and die "Non-alphanumeric character\n";
		@alphabet = sort {$a cmp $b} split(/\s+/, $info);
	}
	elsif ($line =~ /^start/)                   # parse start state
	{
		($tag, $start_state) = split(/:\s*/, $line);  # ignore before colon
		# start state must belong to set of states
		&belongs($start_state, @states) or die "Start state invalid\n";
	}
	elsif ($line =~ /^final/)                   # parse final states
	{
		($tag, $info) = split(/:\s*/, $line);   # ignore before colon
		$info =~ tr/,;/ /;                      # remove separators
		# states must be numbered uniquely and sorted
		@final_states = sort {$a <=> $b} split(/\s+/, $info);
		foreach $fin (@final_states) {
			&belongs($fin, @final_states) or die "Final state $fin invalid\n"; }
	}
	else                                        # read in transitions
	{
		($start, $char, $final) = split(/\s+/, $line);
		&belongs($start, @states) or die "Invalid start $start in $line\n";
		&belongs($char, @alphabet) or die "Invalid char $char in $line\n";
		&belongs($final, @states) or die "Invalid final $final in $line\n";
		push @transitions, [$start, $char, $final];
	}
}

# Test printing of FA
sub test_print
{
	$" = ',';
	print "States of the FA: @states\nAlphabet of the FA: @alphabet\n"
		. "Start state of the FA: $start_state\n"
		. "Final states of the FA: @final_states\n\nTransitions of the FA:\n";
	for $i (0 .. $#transitions) {
		print "\tin $transitions[$i][0] on $transitions[$i][1] "
			. "go to $transitions[$i][2]\n"; }
	for ($k = 0; $k <= $#states+1; $k++) {
		for ($i = 0; $i <= $#states; $i++) {
			for ($j = 0; $j <= $#states; $j++) {
				print "$states[$i], $states[$j], $k: "
					. &regexp($states[$i], $states[$j], $k) . "\n"; } } }
}

# Make regular expression - the algorithm as a recursive function, unless
# the RE required already has been computed earlier. If the latter case,
# look up the RE from a 3D table.
sub regexp
{
	local $i = $_[0]; local $j = $_[1]; local $k = $_[2];
	local $str1; local $str2; local $str3; local $str4;
	# Note call parameters carefully. $i and $j are actual state labels,
	# hence used as such. Originally, $i and $j are called with @states
	# components. $k is a number denoting how many states to go through
	# [Hopcroft & Ullman]. $k = 0 implies recursion termination. Maximally,
	# $k = $#states. When indexing with $k into @states (in order to get
	# appropriate state labels), account for arrays indexing with 0.
	if ($k > 0)
	{
		$str1 = defined($allREs[$i][$states[$k-1]][$k-1])
			? $allREs[$i][$states[$k-1]][$k-1]
			: &regexp($i, $states[$k-1], $k-1);
		$str2 = defined($allREs[$states[$k-1]][$states[$k-1]][$k-1])
			? $allREs[$states[$k-1]][$states[$k-1]][$k-1]
			: &regexp($states[$k-1], $states[$k-1], $k-1);
		$str3 = defined($allREs[$states[$k-1]][$j][$k-1])
			? $allREs[$states[$k-1]][$j][$k-1]
			: &regexp($states[$k-1], $j, $k-1);
		$str4 = defined($allREs[$i][$j][$k-1])
			? $allREs[$i][$j][$k-1]
			: &regexp($i, $j, $k-1);
		# We will use % as the blank string and @ as the empty set.
		# We perform a bunch of optimisations to the RE based on experience.

		# Intersection with an empty set is always empty. e.g., @^A == @
		if ($str1 eq "@" or $str2 eq "@" or $str3 eq "@") {
			$str1 = $str2 = $str3 = ""; }

		# Remove superfluous parentheses in 4th term. e.g., (A) == A
		if ($str4 =~ /^([^)]*)$/) { $str4 =~ s/^\(//; $str4 =~ s/\)$//; }

		# Union with empty set is idempotent. e.g., A+@ == A
		if ($str4 eq "@") { $str4 = ""; }

		# If 1st or 3rd strings are blank, they are subsumed by 2nd.
		# e.g., %a% == a
		if ($str1 eq "%") { $str1 = ""; }
		if ($str3 eq "%") { $str3 = ""; }

		# 2nd string must be checked carefully against the others.
		if ($str2 ne "%" and $str2 ne "")
		{
			# If 2nd string is ORed with blank, blank discarded.
			# e.g., (a+%)* == a*
			if ($str2 =~ /\+%$/)
			{
				# Additionally, if either 1st or 3rd string match 2nd, they
				# can be discarded. e.g., (a+%)(a+%)*(a+%) == a*
				if ($str1 eq $str2) { $str1 = ""; }
				if ($str3 eq $str2) { $str3 = ""; }
				$str2 =~ s/\+%$//;                   # remove end +%
			}
			# If 2nd string is not ORed with blank, there's still a
			# chance of subsumption. e.g., (a+%)a*(a+%)+(a+%) == a*
			else
			{
				if ($str1 =~ /\+%$/)
				{
					$tmpstr = $str1; $tmpstr =~ s/\+%$//;
					if ($tmpstr eq $str2) { $str1 = ""; }
				}
				if ($str3 =~ /\+%$/)
				{
					$tmpstr = $str3; $tmpstr =~ s/\+%$//;
					if ($tmpstr eq $str2) { $str3 = ""; }
				}
			}
			if ($str1 eq "" and $str3 eq "")
			{
				$tmpstr = $str4; $tmpstr =~ s/\+%$//;
				if ($tmpstr eq $str2 or $str4 eq "%") { $str4 = ""; }
			}
			# Parenthetise 2nd string only if necessary.
			$str2 = length($str2) > 1 ? "($str2)*" : "$str2*";
		}
		# Blank out 2nd string only if either 1st or 3rd is non-blank.
		elsif ($str1 ne "" or $str3 ne "") { $str2 = ""; }

		# If 1st and 3rd string concatenate up to 4th string, blank 4th.
		# e.g., a(x)*b+ab = a(x)*b
		if ($str1.$str3 eq $str4) { $str4 = ""};

		# Parenthetise 1st and 3rd string only if necessary.
		if ($str1 =~ /\+/ and ($str2 ne "" or $str3 ne "")) {
			$str1 = "($str1)"; }
		if ($str3 =~ /\+/ and ($str2 ne "" or $str1 ne "")) {
			$str3 = "($str3)"; }

		# If 4th string matches concatenation of first 3, blank 4th. Put
		# the OR only if necessary and check if returning empty set.
		if ($str1.$str2.$str3 eq $str4) { $str4 = ""; }
		if ($str4 ne "" and $str1.$str2.$str3 ne "") { $str4 =~ s/^/+/; }
		return "$str1$str2$str3$str4" eq "" ? "@" : "$str1$str2$str3$str4";
	}
	else
	{
		$foundtrans = 0; $retstring = "";
		foreach $transref (@transitions) {          # does a transition exist?
			if ($i == $transref->[0] and $j == $transref->[2]) {
				$foundtrans = 1 and ($retstring ne "")
					? ($retstring .= "+$transref->[1]")
					: ($retstring = "$transref->[1]"); } }
		if ($i == $j) { $retstring .= ($foundtrans ? "+%" : "%"); } # append
		elsif (!$foundtrans) { $retstring = "@"; }  # no transition
		return $retstring;
	}
}

# Print regular expression for the finite automaton
# &test_print;
print "\n\nThe RE for the given FA is (% is empty string, @ is empty set):\n";
$output = &regexp($start_state, $final_states[0], $#states+1);
foreach $f (1 .. $#final_states) {
	if (($str = &regexp($start_state, $final_states[$f], $#states+1)) ne "@") {
		($output eq "@") ? ($output = "$str") : ($output .= "+$str"); } }
print "$output\n";
