#!/usr/bin/perl # File: fa2re # Author: Anand Natrajan # Contact: anand@virginia.edu, http://www.cs.virginia.edu/~an4m/ # Task: Convert an FA (finite automaton) to an RE (regular expression). # Algorithm from Introduction to Automata Theory, Languages and # Computation by Hopcroft & Ullman, Chapter 2, p33-35. # I/P and O/P: STDIN and STDOUT # Global variables @states = (); # set of states of FA @alphabet = (); # alphabet accepted by FA @transitions = (); # transitions made by FA $start_state = 1; # start state of FA @final_states = (); # set of final states of FA @allREs = ([[0]]); # 3D array storing REs thus far computed # Test if the candidate element belongs in the specified list sub belongs { $candidate = $_[0]; shift; foreach $element (@_) { if ($element eq $candidate) { return 1; } } return 0; } # Read in input file and validate FA foreach $line () { chop($line); $line =~ s/#.*//; # remove comments if ($line =~ /^\s*$/) { next; } # discard blank lines elsif ($line =~ /^states/) # parse states { ($tag, $info) = split(/:\s*/, $line); # ignore before colon $info =~ tr/,;/ /; # remove separators # states must be numbered uniquely and sorted $info =~ /[^ 0-9]/ and die "Non-numeric state number\n"; @states = sort {$a <=> $b} split(/\s+/, $info); } elsif ($line =~ /^alpha/) # parse alphabet { ($tag, $info) = split(/:\s*/, $line); # ignore before colon $info =~ tr/,;/ /; # remove separators # alphabet characters must be unique alphanumerics and sorted $info =~ /[^ a-zA-Z0-9%]/ and die "Non-alphanumeric character\n"; @alphabet = sort {$a cmp $b} split(/\s+/, $info); } elsif ($line =~ /^start/) # parse start state { ($tag, $start_state) = split(/:\s*/, $line); # ignore before colon # start state must belong to set of states &belongs($start_state, @states) or die "Start state invalid\n"; } elsif ($line =~ /^final/) # parse final states { ($tag, $info) = split(/:\s*/, $line); # ignore before colon $info =~ tr/,;/ /; # remove separators # states must be numbered uniquely and sorted @final_states = sort {$a <=> $b} split(/\s+/, $info); foreach $fin (@final_states) { &belongs($fin, @final_states) or die "Final state $fin invalid\n"; } } else # read in transitions { ($start, $char, $final) = split(/\s+/, $line); &belongs($start, @states) or die "Invalid start $start in $line\n"; &belongs($char, @alphabet) or die "Invalid char $char in $line\n"; &belongs($final, @states) or die "Invalid final $final in $line\n"; push @transitions, [$start, $char, $final]; } } # Test printing of FA sub test_print { $" = ','; print "States of the FA: @states\nAlphabet of the FA: @alphabet\n" . "Start state of the FA: $start_state\n" . "Final states of the FA: @final_states\n\nTransitions of the FA:\n"; for $i (0 .. $#transitions) { print "\tin $transitions[$i][0] on $transitions[$i][1] " . "go to $transitions[$i][2]\n"; } for ($k = 0; $k <= $#states+1; $k++) { for ($i = 0; $i <= $#states; $i++) { for ($j = 0; $j <= $#states; $j++) { print "$states[$i], $states[$j], $k: " . ®exp($states[$i], $states[$j], $k) . "\n"; } } } } # Make regular expression - the algorithm as a recursive function, unless # the RE required already has been computed earlier. If the latter case, # look up the RE from a 3D table. sub regexp { local $i = $_[0]; local $j = $_[1]; local $k = $_[2]; local $str1; local $str2; local $str3; local $str4; # Note call parameters carefully. $i and $j are actual state labels, # hence used as such. Originally, $i and $j are called with @states # components. $k is a number denoting how many states to go through # [Hopcroft & Ullman]. $k = 0 implies recursion termination. Maximally, # $k = $#states. When indexing with $k into @states (in order to get # appropriate state labels), account for arrays indexing with 0. if ($k > 0) { $str1 = defined($allREs[$i][$states[$k-1]][$k-1]) ? $allREs[$i][$states[$k-1]][$k-1] : ®exp($i, $states[$k-1], $k-1); $str2 = defined($allREs[$states[$k-1]][$states[$k-1]][$k-1]) ? $allREs[$states[$k-1]][$states[$k-1]][$k-1] : ®exp($states[$k-1], $states[$k-1], $k-1); $str3 = defined($allREs[$states[$k-1]][$j][$k-1]) ? $allREs[$states[$k-1]][$j][$k-1] : ®exp($states[$k-1], $j, $k-1); $str4 = defined($allREs[$i][$j][$k-1]) ? $allREs[$i][$j][$k-1] : ®exp($i, $j, $k-1); # We will use % as the blank string and @ as the empty set. # We perform a bunch of optimisations to the RE based on experience. # Intersection with an empty set is always empty. e.g., @^A == @ if ($str1 eq "@" or $str2 eq "@" or $str3 eq "@") { $str1 = $str2 = $str3 = ""; } # Remove superfluous parentheses in 4th term. e.g., (A) == A if ($str4 =~ /^([^)]*)$/) { $str4 =~ s/^$//; $str4 =~ s/$$//; } # Union with empty set is idempotent. e.g., A+@ == A if ($str4 eq "@") { $str4 = ""; } # If 1st or 3rd strings are blank, they are subsumed by 2nd. # e.g., %a% == a if ($str1 eq "%") { $str1 = ""; } if ($str3 eq "%") { $str3 = ""; } # 2nd string must be checked carefully against the others. if ($str2 ne "%" and $str2 ne "") { # If 2nd string is ORed with blank, blank discarded. # e.g., (a+%)* == a* if ($str2 =~ /\+%$/) { # Additionally, if either 1st or 3rd string match 2nd, they # can be discarded. e.g., (a+%)(a+%)*(a+%) == a* if ($str1 eq $str2) { $str1 = ""; } if ($str3 eq $str2) { $str3 = ""; } $str2 =~ s/\+%$//; # remove end +% } # If 2nd string is not ORed with blank, there's still a # chance of subsumption. e.g., (a+%)a*(a+%)+(a+%) == a* else { if ($str1 =~ /\+%$/) { $tmpstr = $str1; $tmpstr =~ s/\+%$//; if ($tmpstr eq $str2) { $str1 = ""; } } if ($str3 =~ /\+%$/) { $tmpstr = $str3; $tmpstr =~ s/\+%$//; if ($tmpstr eq $str2) { $str3 = ""; } } } if ($str1 eq "" and $str3 eq "") { $tmpstr = $str4; $tmpstr =~ s/\+%$//; if ($tmpstr eq $str2 or $str4 eq "%") { $str4 = ""; } } # Parenthetise 2nd string only if necessary. $str2 = length($str2) > 1 ? "($str2)*" : "$str2*"; } # Blank out 2nd string only if either 1st or 3rd is non-blank. elsif ($str1 ne "" or $str3 ne "") { $str2 = ""; } # If 1st and 3rd string concatenate up to 4th string, blank 4th. # e.g., a(x)*b+ab = a(x)*b if ($str1.$str3 eq $str4) { $str4 = ""}; # Parenthetise 1st and 3rd string only if necessary. if ($str1 =~ /\+/ and ($str2 ne "" or $str3 ne "")) { $str1 = "($str1)"; } if ($str3 =~ /\+/ and ($str2 ne "" or $str1 ne "")) { $str3 = "($str3)"; } # If 4th string matches concatenation of first 3, blank 4th. Put # the OR only if necessary and check if returning empty set. if ($str1.$str2.$str3 eq $str4) { $str4 = ""; } if ($str4 ne "" and $str1.$str2.$str3 ne "") { $str4 =~ s/^/+/; } return "$str1$str2$str3$str4" eq "" ? "@" : "$str1$str2$str3$str4"; } else { $foundtrans = 0; $retstring = ""; foreach $transref (@transitions) { # does a transition exist? if ($i == $transref->[0] and $j == $transref->[2]) { $foundtrans = 1 and ($retstring ne "") ? ($retstring .= "+$transref->[1]") : ($retstring = "$transref->[1]"); } } if ($i == $j) { $retstring .= ($foundtrans ? "+%" : "%"); } # append elsif (!$foundtrans) { $retstring = "@"; } # no transition return $retstring; } } # Print regular expression for the finite automaton # &test_print; print "\n\nThe RE for the given FA is (% is empty string, @ is empty set):\n"; $output = ®exp($start_state, $final_states[0], $#states+1); foreach $f (1 .. $#final_states) { if (($str = ®exp($start_state, $final_states[$f], $#states+1)) ne "@") { ($output eq "@") ? ($output = "$str") : ($output .= "+$str"); } } print "$output\n";