#!/usr/bin/env perl # # csvtrans - transform CSV files, interpreted as directed graphs # # $Id$ use warnings; use strict; use Getopt::Std; use Text::CSV_XS; use List::Util qw(first); my %opt; getopts( 'h?b:f:e:pq:s:t:u:x:F:X:', \%opt ); HELP_MESSAGE() if grep { /^[h?]/ } keys %opt; sub oomph { warn join( ' ', "fatal error:", @_ ); die "for help: $0 -h\n"; } my $sep = $opt{'t'} // ','; my $outsep = $opt{'u'} // $sep; my $quote = $opt{'q'} // ''; my $print_paths = $opt{'p'}; my @operation = grep { /\S/ } split( /\s*;+\s*/, $opt{'e'} // '' ); # correct for any ;s within regexps - note: make a real parser my $undel_matchop_rx = qr'^m([/][^/]*|#[^#]*)$'; # an undelimited regexp op for ( my $i = 0 ; $i < $#operation ; ++$i ) { if ( $operation[$i] =~ $undel_matchop_rx ) { splice( @operation, $i, 2, $operation[$i] . $operation[ $i + 1 ] ); --$i; } } my $csvr = new Text::CSV_XS( { sep_char => $sep, eol => $/, quote_space => 0, quote_char => $quote } ) or oomph("the CSV parser doesn't work (not installed?)"); my $csvw = new Text::CSV_XS( { sep_char => $outsep, eol => $/, quote_space => 0, quote_char => $quote } ) or oomph("the CSV parser doesn't work (not installed?)"); # TO DO: turn the following options into operators, # after introducing support for multiple graphs my %optgraph; foreach my $o ( grep { defined $opt{$_} } ( 'b', 'f', 'x', 's', 'F', 'X' ) ) { if ( ( $opt{$o} ne '-' ) && !-r $opt{$o} ) { oomph("-$o argument, $opt{$o}, doesn't specify a readable input file"); } $optgraph{$o} = [ graph_in_rows( rows_of_csv( $opt{$o} ) ) ]; } @ARGV or @ARGV = ('-'); #warn "input files: ", join(', ', @ARGV), "\n"; print_graph( read_and_transform(@ARGV) ); exit(0); # #--- finis - incipiunt auxilia ----# # sub read_and_transform { # we have the directed graphs given on the command line my @nodes; # lists the nodes in a particular order; # initially, in the order they appeared in the input my %is_node; # maps each node (including isolated ones) to 1 my %src2tgt21; # maps each arc source to a mapping of arc targets to 1 # (duplicate arcs aren't supported at the moment) do { my @rows = map { @$_ } map { rows_of_csv($_) } @_; my ( $nodes, $src2tgt21 ) = graph_in_rows( \@rows ); @nodes = @$nodes; %src2tgt21 = %$src2tgt21; %is_node = map { $_ => 1 } @nodes; }; # first apply -b, -s, -f, and -x if ( $optgraph{b} ) { my $lift = $optgraph{b}->[1]; %src2tgt21 = %{ non0_concat( revert($lift), \%src2tgt21, $lift ) }; my %in_result = endpoint21( \%src2tgt21 ); # we don't want any additional nodes @nodes = grep { $in_result{$_} } @{ $optgraph{b}->[0] }; %is_node = map { $_ => 1 } @nodes; } if ( $optgraph{s} ) { my ( $src2tgt21, $nodes ) = concat2( \@nodes, \%src2tgt21, @{ $optgraph{s} } ); %src2tgt21 = %$src2tgt21; @nodes = @$nodes; %is_node = map { $_ => 1 } @nodes; } if ( $optgraph{x} ) { remove_nodes( \%src2tgt21, \%is_node, @{ $optgraph{x}->[0] } ); } if ( $optgraph{f} ) { my %in_f = map { $_ => 1 } @{ $optgraph{f}->[0] }; #warn( scalar( keys %in_f ), "filter nodes\n" ); remove_nodes( \%src2tgt21, \%is_node, grep { !defined $in_f{$_} } keys %is_node ); } if ( $optgraph{X} ) { # add the 'characteristic function' for the complement of -X my %is_X_node = map { $_ => 1 } @{ $optgraph{X}->[0] }; foreach my $node ( keys(%is_node) ) { $src2tgt21{$node}->{ 0 + defined( $is_X_node{$node} ) } = 1; } } if ( $optgraph{F} ) { # add the 'characteristic function' for -F my %is_F_node = map { $_ => 1 } @{ $optgraph{F}->[0] }; foreach my $node ( keys(%is_node) ) { my $occurs = 0 + !defined( $is_F_node{$node} ); $is_node{$occurs} = 1; $src2tgt21{$node}->{$occurs} = 1; } } # then apply -e my %cmp = ( 'eq' => sub { $_[0] eq $_[1] }, 'ge' => sub { $_[0] ge $_[1] }, 'gt' => sub { $_[0] gt $_[1] }, 'le' => sub { $_[0] le $_[1] }, 'lt' => sub { $_[0] lt $_[1] }, 'ne' => sub { $_[0] ne $_[1] } ); # hack: add case insensitive comparisons foreach my $cmp ( keys %cmp ) { $cmp{"i$cmp"} = sub { $cmp{$cmp}( map { lc } @_ ); } } foreach my $op (@operation) { if ( $op eq 'chain' ) { my %src2prevtgt; my %new_src2tgt21; # go through the arcs, sorted by node order my $node_cmp = by_orig_order( \@nodes, \%is_node ); foreach my $src ( sort $node_cmp keys %src2tgt21 ) { foreach my $tgt ( sort $node_cmp keys %{ $src2tgt21{$src} } ) { my $newsrc = $src2prevtgt{$src} // $src; $new_src2tgt21{$newsrc}->{$tgt} = 1; $src2prevtgt{$src} = $tgt; } } %src2tgt21 = %new_src2tgt21; # the list of nodes doesn't change } elsif ( $op eq 'square' ) { %src2tgt21 = %{ squared( \%src2tgt21 ) }; } elsif ( $op eq 'close' ) { # replace the directed graph with its transitive closure %src2tgt21 = %{ trans_close( \%src2tgt21 ) }; } elsif ( $op eq 'cycles' ) { # remove all arcs that are not part of any cycles %src2tgt21 = %{ cycles_only( \%src2tgt21 ) }; } elsif ( $op eq 'noncycles' ) { # remove all arcs that are part of cycles remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) ); } elsif ( $op eq 'cyclify' ) { # replace all connected components with cycles on their nodes my $new = cycle_graph( \%src2tgt21 ); remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) ); add_arcs( \%src2tgt21, $new ); } elsif ( $op eq 'petalify' ) { # replace all connected components with arc pairs on their nodes my $new = petal_graph( \%src2tgt21 ); remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) ); add_arcs( \%src2tgt21, $new ); } elsif ( $op eq 'contract' ) { # reduce all cycles in the graph to a single node my %rest2first = map { my ( $first, @rest ) = @$_; map { $_ => $first } @rest; } grep { @$_ > 1 } @{ sorted_cycles( \%src2tgt21 ) }; add_arcs( \%src2tgt21, substituted_arcs( \%src2tgt21, 'e', \%rest2first ) ); remove_nodes( \%src2tgt21, \%is_node, keys %rest2first ) ; # QQQ swap statements? } elsif ( $op eq 'tsort' ) { @nodes = @{ isolated_nodes( \%src2tgt21, \%is_node ), non0_topsort( \%src2tgt21 ) }; } elsif ( $op eq 'reduce' ) { %src2tgt21 = %{ trans_reduce( \%src2tgt21 ) }; } elsif ( $op eq 'rmarcs' ) { # remove all arcs %src2tgt21 = (); } elsif ( $op eq 'rev' ) { # reverse the graph %src2tgt21 = %{ revert( \%src2tgt21 ) }; # the nodes stay the same } elsif ( $op eq 'sym' ) { # add the reverse graph while ( my ( $src, $tgt21 ) = each %src2tgt21 ) { foreach my $tgt ( keys %$tgt21 ) { $src2tgt21{$tgt}->{$src} = 1; } } # the nodes stay the same } elsif ( $op eq 'tgtcmp' ) { # replace the directed graph with the one obtained by # comparing the target nodes per source node my %new_src2tgt21; foreach my $src1 ( keys %src2tgt21 ) { foreach my $src2 ( keys %src2tgt21 ) { if ( !grep { !defined $src2tgt21{$src2}->{$_} } keys %{ $src2tgt21{$src1} } ) { # all of $src1's targets are also targets of $src2 # ($src2 includes $src1, when both are regarded as sets of targets) $new_src2tgt21{$src2}->{$src1} = 1; } } } %src2tgt21 = %new_src2tgt21; # this result is reflexive, so the nodes stay the same } elsif ( defined( my $cmpop = first { index( $op, $_ ) == 0 } keys %cmp ) ) { my ( $cmper, $premodifier ) = ( $cmp{$cmpop}, ( $cmpop ne $op ? substr( $op, length($cmpop) + 1 ) : undef ) ); # retain only the arcs $a->$b for which $a $op $b if ( !defined($premodifier) || $premodifier eq '' ) { # no premodification requested %src2tgt21 = %{ filtered_arcs( \%src2tgt21, $cmper ) }; } elsif ( my ( $ld, $rx, $rd, $mods ) = $premodifier =~ m'^s([\/#])(.*)([^msixpogce])([msixpogce]*)$' ) { if ( my ( $crit_rx, $repl, $use_g ) = match_or_subst( 's', $ld, $rx, $rd, $mods ) ) { my %subst = node_substitution( \%is_node, $crit_rx, $repl, $use_g ); # apply the premodification before comparing %src2tgt21 = %{ filtered_arcs( \%src2tgt21, sub { &$cmper( map { $subst{$_} } @_ ); } ) }; #warn "filtering leaves ", scalar( keys %src2tgt21 ), " arcs\n"; } else { warn "uninterpretable pre-substitution, ignoring filter operation: $op\n"; } } else { warn "malformed pre-substitution, ignoring filter operation: $op\n"; } # note that no nodes are removed } elsif ( $op eq 'addid' ) { # create arcs $node -> $node for every node $src2tgt21{$_}->{$_} = 1 for keys %is_node; # the set of nodes stays the same } elsif ( grep { $op eq $_ } ( 'in', 'out', 'io' ) ) { # replace all arcs with the in-, out- or total degree of each node my ( $outdegree, $indegree ) = all_degrees( \%src2tgt21, \%is_node ); my %degree = $op eq 'in' ? %$indegree : $op eq 'out' ? %$outdegree : map { $_ => $indegree->{$_} + $outdegree->{$_} } keys %is_node; my %new_src2tgt; while ( my ( $n, $d ) = each %degree ) { $is_node{$d} = 1; $new_src2tgt{$n}->{$d} = 1; } %src2tgt21 = %new_src2tgt; } elsif ( my ( $ends, $sep, $neg, $crit, $dir, $cmp, $n, $mors, $ld, $rx, $rd, $mods, $body ) = $op =~ m'^(l|r|b|e|)(:?)(!|)((in|out|io)(<|=|>)(\d+)|(m|s)([\/#])(.*)([^msixpogce])([msixpogce]*)|{(?(?:[^{}]*+|{(?&G)})*)})$' ) { # without $ends, retain only nodes that pass the given criterion # with $ends, retain only arcs whose end(s) pass the given criterion if ( defined $body ) { # {body} is a shorthand for s/.*/$body/ except with {} delimiters ( $mors, $ld, $rx, $rd, $mods ) = ( 's', '}', ".*}$body", '}', '' ); } # first, determine the nodes that do *not* pass the given criterion my @failing_nodes; # or in case of a substitution, which substitutions to make my %subst; if ($dir) { # $crit is a degree criterion # compute the indegrees and outdegrees of all nodes my $passes = $cmp eq '<' ? sub { $_[0] < $n } : $cmp eq '=' ? sub { $_[0] == $n } : sub { $_[0] > $n }; my $in = ( $dir ne 'out' ); my $out = ( $dir ne 'in' ); my ( $outdegree, $indegree ) = non0_degrees( \%src2tgt21 ); @failing_nodes = grep { my ( $in, $out ) = ( $indegree->{$_} // 0, $outdegree->{$_} // 0 ); #warn "indegree($_) = $in\n"; #warn "outdegree($_) = $out\n"; my $passes = ( $dir eq 'in' && !&$passes($in) ) || ( $dir eq 'out' && !&$passes($out) ) || ( $dir eq 'io' && !&$passes( $in + $out ) ); $neg xor $passes # if $neg is '!', reverse the condition } keys %is_node; } elsif ( $neg && $mors eq 's' ) { warn "negated substitution is nonsensical, treated as no-op: $op\n"; } else { # $crit is a regex operation # with contents $rx and delimiters $ld and $rd my ( $crit_rx, $repl, $use_g ) = match_or_subst( $mors, $ld, $rx, $rd, $mods ); if ($crit_rx) { if ( $mors eq 'm' ) { warn "g ignored, doesn't make sense in match operation $op\n" if $use_g; @failing_nodes = grep { $neg xor !/$crit_rx/ } keys %is_node; } else { %subst = node_substitution( \%is_node, $crit_rx, $repl, $use_g ); } } else { my $match = ( $mors eq 'm' ) ? 'match' : 'substitution'; warn "$match operation with invalid pattern treated as no-op: $op\n"; } } $mors = 'm' if !defined $mors; if ( $ends eq '' ) { if ( $mors eq 'm' ) { # remove all nodes that do *not* meet the given criterion remove_nodes( \%src2tgt21, \%is_node, @failing_nodes ); } else { # perform the substitution on all nodes (and arc endpoints) %src2tgt21 = %{ substituted_arcs( \%src2tgt21, $ends, \%subst ) }; %is_node = map { ( defined $subst{$_} ? $subst{$_} : $_ ) => 1 } keys %is_node; } } elsif ( $mors eq 'm' ) { # remove all arcs of which the indicated end point(s) # do(es) *not* meet the given criterion my %fails = map { $_ => 1 } @failing_nodes; while ( my ( $src, $tgt21 ) = each %src2tgt21 ) { if ( $fails{$src} && $ends =~ /l|b/ ) { #warn "removing $src -> *\n"; delete $src2tgt21{$src}; } else { foreach my $tgt ( keys %$tgt21 ) { if ( $fails{$tgt} && ( $ends =~ /r|b/ || ( $ends =~ /e/ && $fails{$src} ) ) ) { #warn "removing $src -> $tgt\n"; delete $tgt21->{$tgt}; delete $src2tgt21{$src}->{$tgt}; } } if ( !%$tgt21 ) { #warn "removed all arcs for $src\n"; delete $src2tgt21{$src}; } } } } else { if ( $ends eq 'b' ) { #warn "interpreting b as e in the substitution command $op\n"; } # perform the substitution on all indicated arc end point(s), # but if only on one end, don't remove any nodes on the other end # first, compute the new arcs my $new_src2tgt21 = substituted_arcs( \%src2tgt21, $ends, \%subst ); # update the arcs, but also update the set of nodes: # a node disappears if it no longer occurs as an end point # and appears if it didn't yet occur as an end point my %oldnonendpoint21 = map { $_ => 1 } isolated_nodes( \%src2tgt21, \%is_node ); %src2tgt21 = %$new_src2tgt21; %is_node = ( %oldnonendpoint21, endpoint21($new_src2tgt21) ); #warn(" the nodes are now: ",join(', ', keys %is_node), "\n"); #warn "the new arcs are now:\n"; dump_arcs(\%src2tgt21); } } else { warn "unrecognized operation treated as no-op: $op\n"; } } ( \@nodes, \%is_node, \%src2tgt21 ) } sub non0_topsort # yields a reference to the list of arc endpoints in topological sort order { my ($src2tgt21) = @_; my %rest2first = map { my ( $first, @rest ) = @$_; map { $_ => $first } @rest; } grep { @$_ > 1 } @{ sorted_cycles($src2tgt21) }; my ( $out, $in ) = non0_degrees($src2tgt21); my @current = grep { !defined $in->{$_} } keys %$out; my @topsorted; while ( my $src = shift(@current) ) { push( @topsorted, $src ); foreach my $tgt ( keys %{ $src2tgt21->{$src} } ) { if ( !--$in->{$tgt} ) { push( @current, $tgt ); } } } [@topsorted]; } sub match_or_subst # parses and interprets, but does not execute, a regexp-using operation { my ( $mors, $ld, $rx, $rd, $mods ) = @_; my $op = "$ld$rx$ld$rd$mods"; my $repl; if ( $mors eq 's' ) { # it is a substitution operation, so $rx must contain # a delimiter, followed by the (possibly empty) replacement if ( ( my $delpos = index( $rx, $ld ) ) >= 0 ) { ( $rx, $repl ) = ( substr( $rx, 0, $delpos ), substr( $rx, $delpos + 1 ) ); } else { warn "assuming an empty replacement in substitution operation $op\n"; $repl = ''; } if ( $ld ne $rd ) { # cross fingers - perhaps the closing delimiter was omitted warn "adding closing delimiter on substitution operation $op\n"; $repl .= $rd; # equivalent, but harder to formulate } } elsif ( $ld ne $rd ) { # cross fingers - perhaps the closing delimiter was omitted warn "adding closing delimiter on match operation $op\n"; $rx .= $rd; # equivalent, but harder to formulate } my $use_g = 0; if ($mods) { # we can't just suffix the operation with $mods, unfortunately my $non_g_mods = $mods; $use_g = ( $non_g_mods =~ s/g//g ); $rx = "(?$non_g_mods)$rx" if $non_g_mods; } ( eval { $ld eq '/' ? qr/$rx/ : qr#$rx# }, $repl, $use_g ); } sub node_substitution # yields the substitution s/$crit_rx/$repl/g? # on the given nodes, returning the result as a function (hash) { my ( $is_node, $crit_rx, $repl, $use_g ) = @_; map { my $orig = $_; if ($use_g) { s/$crit_rx/qq(qq($repl))/eeg } # thank you, else { s/$crit_rx/qq(qq($repl))/ee; } # anno@#perl ( $orig, $_ ) } keys %$is_node; } sub print_graph # print the directed graph, with the nodes in original order # (and any non-original nodes following in arbitrary order); # this is important if we want to compare results with originals # using tools that depend on order, such as diff { my ( $oldnodes, $is_node, $src2tgt21 ) = @_; # list all nodes, with the original ones first, in original order my $node_cmp = by_orig_order( $oldnodes, $is_node ); # print the isolated nodes in node order print_rows( map { [$_] } sort $node_cmp isolated_nodes( $src2tgt21, $is_node ) ); if ( !$print_paths ) # no -p { # print the arcs, sorted by node order foreach my $src ( sort $node_cmp keys %$src2tgt21 ) { print_rows( map { [ $src, $_ ] } sort $node_cmp keys %{ $src2tgt21->{$src} } ); } } else { # remove all cycle arcs my $wo_cycles = copy($src2tgt21); remove_arcs( $wo_cycles, cycles_only($src2tgt21) ); # compute all maximal paths # we can do it in one pass if we visit the nodes in the right order my %src2paths; # each value is an arrayref of arrayrefs, each a path my %initial21; # is this node at the start of a path? my @bottomsorted = reverse @{ non0_topsort($wo_cycles) }; #warn "bottomsorted nodes: @bottomsorted\n"; foreach my $src (@bottomsorted) { $initial21{$src} = 1; if ( defined( $wo_cycles->{$src} ) ) { my @tgts = sort $node_cmp keys %{ $wo_cycles->{$src} }; #warn "$src has ", scalar(@tgts), " children: @tgts\n"; my @tgtpaths = @tgts ? map { $src2paths{$_} } @tgts : [ [] ]; @tgtpaths = map { @$_ } @tgtpaths; #warn "$src has ", scalar(@tgtpaths), " continuation paths\n"; $src2paths{$src} = [ map { [ $src, @$_ ] } @tgtpaths ]; delete $initial21{$_} for @tgts; } else { $src2paths{$src} = [ [$src] ]; warn "paths for leaf $src: ", @{ $src2paths{$src} }, "\n"; } } #warn "initial: $_\n" for sort $node_cmp keys %initial21; print_rows( map { @{ $src2paths{$_} } } sort $node_cmp keys %initial21 ); } } sub by_orig_order # orders all nodes stably (that is why it adds the new ones first) { my ( $oldnodes, $is_node ) = @_; my $last = $#$oldnodes; my %node2idx = map { $oldnodes->[$_] => $_ } 0 .. $last; my @newnodes = grep { !defined $node2idx{$_} } keys %$is_node; $node2idx{$_} = ++$last for @newnodes; sub { $node2idx{$a} <=> $node2idx{$b} }; } sub HELP_MESSAGE { print STDERR < d for each node n, where d is n's io indegree, outdegree, or indegree+outdegree, respectively cmp retains only the arcs \$a -> \$b for which \$a cmp \$b cmp:subst retains only the arcs \$a -> \$b for which sa cmp sb where sa, sb are the result of applying substitution subst to \$a and \$b, respectively where cmp is one of Perl's string comparison operators eq, ne, gt, ge, lt, le or, for case-insensitive comparison, the same preceded with i: ieq, ine, igt, ige, ilt, ile and subst is a Perl substitution command, e.g. s#.(.).*#\$1# (see details below); so ieq is a shorthand for eq:s#.*#\L\$&#, etcetera crit retains only the nodes that meet crit l:crit retains only the arcs whose source meets crit r:crit retains only the arcs whose target meets crit e:crit retains only the arcs whose source or target meets crit b:crit retains only the arcs whose source and target meets crit where crit is one of: in=0, in>0, in=1, in>1, in<2, in=2, in>2, ... a constraint on the node's number of incoming arcs out=0, out>0, out=1, ... a constraint on the node's number of outgoing arcs io=0, io>0, io=1, ... a constraint on the node's total number of arcs m/regexp/msixpoce, m#regexp#msixpoce a constraint on the node name; regexp is a Perl regular expression that does not contain its delimiter (/ or #); msixpoce is zero of more of the characters in msixpoge, and serves as a (Perl) matching modifier and crit may be prefixed with ! to negate it; subst modifies nodes l:subst modifies arc sources r:subst modifies arc targets e:subst modifies arc sources and targets where subst is one of: s/regexp/repl/msixpogce, s#regexp#repl/msixpogce a (Perl) substitution applied to the node name; regexp and msixpogce are as before, except that g is allowed; repl is the substituted value; or {repl} which is a shorthand for s/.*/repl/; if arcs are being modified, four strings have special meaning in repl: %n stands for the arc endpoint being modified %o stands for the other endpoint %l stands for the arc source %r stands for the arc target Example: e:s/.*/%o/g e:{%o} are both equivalent to rev ZZ exit(0); } sub rows_of_csv { my @rows; my ($f) = @_; my $infh; if ( $f eq '-' ) { $infh = *STDIN; } elsif ( !open( $infh, '<', $f ) ) { warn "warning: cannot read from $f\n"; } while ( my $row = $csvr->getline($infh) ) { push( @rows, $row ); } #warn "read ", scalar(@rows), " rows from $f\n"; \@rows; } sub print_rows { foreach my $row (@_) { $csvw->print( *STDOUT, $row ); } } sub graph_in_rows { my ( @nodes, %is_node, %src2tgt21 ); foreach my $row ( @{ $_[0] } ) { foreach my $node (@$row) { if ( !defined $is_node{$node} ) { push( @nodes, $node ); $is_node{$node} = 1; #warn "added node $node\n"; } } if ( @$row > 1 ) { # 2 or more rows - a path foreach my $arc ( pairs(@$row) ) { # an arc on the path $src2tgt21{ $arc->[0] }->{ $arc->[1] } = 1; #warn "added arc $arc->[0] -> $arc->[1] \n"; } } } ( \@nodes, \%src2tgt21 ); } sub pairs { #warn 'pairs(', join(', ', @_), ")\n"; if ( !@_ ) { (); } elsif ( @_ == 1 ) { ( [] ); } else { map { [ $_[$_], $_[ $_ + 1 ] ] } 0 .. $#_ - 1; } } sub petals { #warn 'petals(', join(', ', @_), ")\n"; if ( !@_ ) { (); } elsif ( @_ == 1 ) { ( [] ); } else { map { [ $_[0], $_[$_] ], [ $_[$_], $_[0] ] } 1 .. $#_; } } sub remove_nodes { my ( $src2tgt21, $is_node ) = @_; delete $is_node->{$_} for @_; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { if ( !$is_node->{$src} ) { delete $src2tgt21->{$src}; } else { foreach my $tgt ( keys %$tgt21 ) { if ( !$is_node->{$tgt} ) { delete $tgt21->{$tgt}; } } delete $src2tgt21->{$src} if !%$tgt21; } } } sub add_arcs { my ( $src2tgt21, $new_src2tgt ) = @_; while ( my ( $src, $n_tgt21 ) = each %$new_src2tgt ) { foreach my $tgt ( keys %$n_tgt21 ) { #warn "adding $src->$tgt\n" if !defined $src2tgt21->{$src}->{$tgt}; $src2tgt21->{$src}->{$tgt} = 1; } } } sub remove_arcs { my ( $src2tgt21, $doomed_src2tgt ) = @_; while ( my ( $src, $n_tgt21 ) = each %$doomed_src2tgt ) { my $tgt21 = $src2tgt21->{$src}; delete $tgt21->{$_} for keys %$n_tgt21; delete $src2tgt21->{$src} if !%$tgt21; } } sub has_arc { my ( $src2tgt21, $src, $tgt ) = @_; defined $src2tgt21->{$src}->{$tgt}; } sub arcs_for_which { my ( $src2tgt21, $arcprop ) = @_; my %new_src2tgt; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { foreach my $tgt ( keys %$tgt21 ) { if ( &$arcprop( $src, $tgt ) ) { ++$new_src2tgt{$src}->{$tgt} if !$new_src2tgt{$src}->{$tgt}; } } } \%new_src2tgt; } sub pairs2arcs { my %src2tgt21; foreach my $p (@_) { my ( $src, $tgt ) = @$p; $src2tgt21{$src}->{$tgt} = 1; } \%src2tgt21; } sub non0_degrees # does not include degrees of 0 # argument: \%src2tgt21 { my ( %outdegree, %indegree ); while ( my ( $src, $tgt21 ) = each %{ $_[0] } ) { foreach my $tgt ( keys %$tgt21 ) { ++$outdegree{$src}; ++$indegree{$tgt}; } } ( \%outdegree, \%indegree ); } sub all_degrees # does include degrees of 0 # arguments: \%src2tgt21, \%is_node { my $is_node = $_[1]; die "BUG: missing or wrong 2nd argument to all_degrees()\n" if !defined($is_node) || ref $is_node ne 'HASH'; my ( $outdegree, $indegree ) = non0_degrees( $_[0] ); $outdegree->{$_} = 0 for grep { !defined $outdegree->{$_} } keys %$is_node; $indegree->{$_} = 0 for grep { !defined $indegree->{$_} } keys %$is_node; ( ( $outdegree, $indegree ) ); } sub srcs_and_tgts { map { [ keys %$_ ] } non0_degrees( $_[0] ); } sub srcs_not_tgts { my ( $out, $in ) = non0_degrees( $_[0] ); [ grep { !defined $in->{$_} } keys %$out ]; } sub targets { my ( $out, $in ) = non0_degrees( $_[0] ); [ keys %$in ]; } sub endpoint21 # maps each node that is an arc source or target to 1 { map { $_ => 1 } map { keys %$_ } non0_degrees( $_[0] ); } sub endpoints # lists the nodes that are an arc source and/or target { my %ep21 = endpoint21( $_[0] ); keys %ep21; } sub isolated_nodes # lists the nodes that are no arc source or target { my ( $src2tgt21, $is_node ) = @_; my %ep21 = endpoint21($src2tgt21); grep { !defined $ep21{$_} } keys %$is_node; } sub filtered_arcs # lists the arcs that satisfy the given arc predicate { my ( $src2tgt21, $arc_passes ) = @_; my %new_src2tgt21; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { foreach my $tgt ( keys %{ $src2tgt21->{$src} } ) { if ( &$arc_passes( $src, $tgt ) ) { $new_src2tgt21{$src}->{$tgt} = 1; #warn "arc_filter($src->$tgt) passes\n"; } } } \%new_src2tgt21; } sub substituted_arcs # computes the substitute arcs according to # + the endpoint specification $end; # + the *node* substitution function %$subst # ('l' for sources only, 'r' for targets only, 'e' for both ends) # if $subst->{$src} or $subst->{$tgt} is undef, the arc in question is omitted, # but if it does not *exist*, the end point in question remains unchanged { my ( $src2tgt21, $ends, $subst ) = @_; my %new_src2tgt21; #warn "substituting (at $ends):\n", map { "$_ -> $subst->{$_}\n" } sort keys %$subst; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { #warn "within arcs:\n", map { "$src -> $_\n" } sort keys %$tgt21; my $newsrc = $src; if ( $ends ne 'r' && exists $subst->{$src} ) { #warn "substituting $src -> $subst->{$src}\n"; $newsrc = $subst->{$src}; next if !defined $newsrc; } foreach my $tgt ( keys %{ $src2tgt21->{$src} } ) { my $newtgt = $tgt; if ( $ends ne 'l' && exists $subst->{$tgt} ) { #warn "substituting $tgt -> $subst->{$tgt}\n"; $newtgt = $subst->{$tgt}; } next if !defined $newtgt; # only now, substitute %l and %r in $newsrc and $newtgt my $newersrc = join( $src, split( /%[ln]/, $newsrc, -1 ) ); $newtgt = join( $tgt, split( /%[rn]/, $newtgt, -1 ) ); $newersrc = join( $tgt, split( /%[ro]/, $newersrc, -1 ) ); $newtgt = join( $src, split( /%[lo]/, $newtgt, -1 ) ); #warn "substituting $src -> $tgt with $newersrc -> $newtgt\n"; $new_src2tgt21{$newersrc}->{$newtgt} = 1; } } #warn "arcs after substitution:\n", map { my $src = $_; map { "$src -> $_\n" } sort keys %{$new_src2tgt21{$src}} } sort keys %new_src2tgt21; \%new_src2tgt21; } sub copy # %$src2tgt21 is not a *deep* copy of $src2tgt21 { my ($src2tgt21) = @_; my %copy; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { $copy{$src} = {%$tgt21}; # a copy } \%copy; } sub revert { my ($src2tgt21) = @_; my %tgt2src21; while ( my ( $src, $tgt21 ) = each %$src2tgt21 ) { foreach my $tgt ( keys %$tgt21 ) { $tgt2src21{$tgt}->{$src} = 1; } } \%tgt2src21; } sub squared_arcs # for a relation R, yields R^2 { non0_concat2( $_[0], $_[0] ); } sub non0_concat2 # for relations R, S, yields R.S, only on arcs { my ( $src2mid21, $mid2tgt21 ) = @_; my %src2tgt21; while ( my ( $src, $mid21 ) = each %$src2mid21 ) { foreach my $mid ( keys %$mid21 ) { next if !defined $mid2tgt21->{$mid}; foreach my $tgt ( keys %{ $mid2tgt21->{$mid} } ) { $src2tgt21{$src}->{$tgt} = 1; #warn "$src -> $mid -> $tgt\n"; } } } \%src2tgt21; } sub non0_concat # for relations R1, ..., Rn, yields R1 . ... . Rn, only on arcs { my $r1 = shift(@_) // die "BUG: argumentless call to concat()\n"; if (@_) { non0_concat2( $r1, non0_concat(@_) ); } else { $r1; } } sub concat2 # non0_concat2 and intersect the isolated nodes { my ( $nodes1, $src2mid21, $nodes2, $mid2tgt21 ) = @_; my %ep1 = endpoint21($src2mid21); my %ep2 = endpoint21($mid2tgt21); my $src2tgt21 = non0_concat2( $src2mid21, $mid2tgt21 ); my %ep3 = endpoint21($src2tgt21); my %is_node1 = map { $_ => 1 } @$nodes1; my @nodes2not1 = grep { !$is_node1{$_} } @$nodes2; my %isol1 = map { $_ => 1 } grep { !defined $ep1{$_} } @$nodes1; my %isol2 = map { $_ => 2 } grep { !defined $ep2{$_} } @$nodes2; my @nodes2keep = grep { $ep3{$_} || ( $isol1{$_} && $isol2{$_} ) } ( @$nodes1, @nodes2not1 ); ( non0_concat2( $src2mid21, $mid2tgt21 ), [@nodes2keep] ); } sub nodes_to_selfloops # given a list of nodes, yields a graph with selfloops { my $ref = { map { $_ => { $_ => 1 } } @_ }; # the assignment is to force returning a hashref (!!) } sub nodes_with_selfloops { my ($src2tgt21) = @_; grep { $src2tgt21->{$_}->{$_} } keys %$src2tgt21; } sub without_selfloops { my $rs_src2tgt21 = copy( $_[0] ); delete $rs_src2tgt21->{$_}->{$_} for keys %$rs_src2tgt21; $rs_src2tgt21; } sub trans_close # for a relation R, yields R^+ =D= R u R^2 u R^3 u ... { #my %src2tgt21 = %{ $_[0] }; # not a deep copy! my $src2tgt21 = copy( $_[0] ); my $tgt2src21 = revert( $_[0] ); foreach my $mid ( keys %$src2tgt21 ) { foreach my $src ( keys %{ $tgt2src21->{$mid} } ) { foreach my $tgt ( keys %{ $src2tgt21->{$mid} } ) { $src2tgt21->{$src}->{$tgt} = 1; $tgt2src21->{$tgt}->{$src} = 1; #warn "$src -> $mid -> $tgt\n"; } } } $src2tgt21; } sub trans_refl_close # for a relation R, yields R^* =D= I u R^ u R^2 u R^3 u ... { my ( $src2tgt21, $is_node ) = @_; my $trcl = trans_close($src2tgt21); $trcl->{$_}->{$_} = 1 for keys %$is_node; $trcl; } sub trans_reduce # for a relation R, yields its smallest subset that # - has the same transitive closure # - has the exact same cycles # so reduce the acyclic part of the graph; don't touch the cycles # this took a LONG time to get right, and I'm not even sure if it is # input graph used for testing: #a,b,c,d,e,f,b,g,h #a,h #i,d,b { my ($src2tgt21) = @_; # follow TRANSITIVE REDUCTION OF A DIRECTED GRAPH (Aho, Garey & Ullman) # which gives an algorithm on p. 136 #my $wo_sl = without_selfloops( $src2tgt21 ); #my $redundancy = squared( $wo_sl ); # except that that algorithm is flawed (try inputting a,b,c,d; a,d) #my $redundancy = non0_concat2( $wo_sl, trans_close( $wo_sl ) ); # and that we treat all arcs on cycles as having length 0 and leave them in # there may be a more efficient implementation ... my $zerosteps = cycles_only($src2tgt21); add_arcs( $zerosteps, nodes_to_selfloops( endpoints($src2tgt21) ) ); my $wo_cycles = copy($src2tgt21); remove_arcs( $wo_cycles, $zerosteps ); my $onestep = non0_concat2( $zerosteps, $wo_cycles ); add_arcs( $onestep, $wo_cycles ); add_arcs( $onestep, non0_concat2( $wo_cycles, $zerosteps ) ); my $redundancy = non0_concat2( $onestep, trans_close($onestep) ); my $reduced = copy($src2tgt21); remove_arcs( $reduced, $redundancy ); $reduced; } sub dump_arcs # for debugging { my ($src2tgt21) = @_; foreach my $src ( sort keys %$src2tgt21 ) { foreach my $tgt ( sort keys %{ $src2tgt21->{$src} } ) { warn "$src->$tgt\n"; } } } sub reachable { keys %{ reachable21(@_) }; } sub reachable21 { my ( $src2tgt21, @next ) = @_; my %r21; while ( @next = grep { !$r21{$_} } map { keys %{ $src2tgt21->{$_} } } @next ) { $r21{$_} = 1 for @next; } #warn join(',', @_), ' ->> ', join(',', sort keys %r21), "\n"; \%r21; } sub cycles_with_map { # this is not the most efficient method my $trcl = trans_close(@_); my %node2repr; my @cycles; foreach my $src ( keys %$trcl ) { next if defined $node2repr{$src}; my $tgt21 = $$trcl{$src}; my @rest; my $repr; foreach my $tgt ( grep { !defined $node2repr{$_} } keys %$tgt21 ) { if ( defined $trcl->{$tgt}->{$src} ) # a cycle { $node2repr{$tgt} = $repr = $src; push( @rest, $tgt ) if $tgt ne $src; #warn "$src <- $tgt\n"; } #else { warn "$src ', $src, @rest, $src), "\n"; } } ( \@cycles, \%node2repr ); } sub cycles # produces an arrayref with arrays listing the nodes of each strongly connected component { ( cycles_with_map(@_) )[0]; } sub cycle_map # produces an hashref that maps each node to the same node in their strongly conected component { ( cycles_with_map(@_) )[1]; } sub sorted_cycles # returns an unsorted list of arrays, each listing the sorted nodes of a cycle { [ map { [ sort @$_ ] } @{ cycles(@_) } ]; } sub cycle_graph # returns arcs constituting a cycle per connected component { pairs2arcs( map { pairs( @$_, $_->[0] ) } @{ sorted_cycles(@_) } ); } sub petal_graph # returns arcs constituting a petal per connected component { pairs2arcs( map { petals(@$_) } @{ sorted_cycles(@_) } ); } sub cycles_only # returns the arcs on cycles { my ($src2tgt21) = @_; my $node2repr = cycle_map($src2tgt21); my $in_same_component = sub { my ( $sr, $tr ) = map { $node2repr->{$_} } @_; defined($sr) && defined($tr) && $sr eq $tr; }; arcs_for_which( $src2tgt21, $in_same_component ); }