#!/usr/bin/env perl
#
# csvtrans - transform CSV files, interpreted as directed graphs
#
# $Id$

use warnings;
use strict;

use Getopt::Std;
use Text::CSV_XS;
use List::Util qw(first);

my %opt;
getopts( 'h?b:f:e:pq:s:t:u:x:F:X:', \%opt );

HELP_MESSAGE() if grep { /^[h?]/ } keys %opt;

sub oomph { warn join( ' ', "fatal error:", @_ ); die "for help: $0 -h\n"; }

my $sep    = $opt{'t'} // ',';
my $outsep = $opt{'u'} // $sep;
my $quote  = $opt{'q'} // '';
my $print_paths = $opt{'p'};

my @operation = grep { /\S/ } split( /\s*;+\s*/, $opt{'e'} // '' );

# correct for any ;s within regexps - note: make a real parser
my $undel_matchop_rx = qr'^m([/][^/]*|#[^#]*)$';    # an undelimited regexp op

for ( my $i = 0 ; $i < $#operation ; ++$i )
{
  if ( $operation[$i] =~ $undel_matchop_rx )
  {
    splice( @operation, $i, 2, $operation[$i] . $operation[ $i + 1 ] );
    --$i;
  }
}

my $csvr = new Text::CSV_XS(
  {
    sep_char    => $sep,
    eol         => $/,
    quote_space => 0,
    quote_char  => $quote
  }
) or oomph("the CSV parser doesn't work (not installed?)");

my $csvw = new Text::CSV_XS(
  {
    sep_char    => $outsep,
    eol         => $/,
    quote_space => 0,
    quote_char  => $quote
  }
) or oomph("the CSV parser doesn't work (not installed?)");

# TO DO: turn the following options into operators,
# after introducing support for multiple graphs

my %optgraph;

foreach my $o ( grep { defined $opt{$_} } ( 'b', 'f', 'x', 's', 'F', 'X' ) )
{
  if ( ( $opt{$o} ne '-' ) && !-r $opt{$o} )
  {
    oomph("-$o argument, $opt{$o}, doesn't specify a readable input file");
  }

  $optgraph{$o} = [ graph_in_rows( rows_of_csv( $opt{$o} ) ) ];
}

@ARGV or @ARGV = ('-');

#warn "input files: ", join(', ', @ARGV), "\n";

print_graph( read_and_transform(@ARGV) );

exit(0);

#
#--- finis - incipiunt auxilia ----#
#

sub read_and_transform
{
  # we have the directed graphs given on the command line

  my @nodes;    # lists the nodes in a particular order;
  # initially, in the order they appeared in the input
  my %is_node;      # maps each node (including isolated ones) to 1
  my %src2tgt21;    # maps each arc source to a mapping of arc targets to 1
  # (duplicate arcs aren't supported at the moment)

  do
  {
    my @rows = map { @$_ } map { rows_of_csv($_) } @_;
    my ( $nodes, $src2tgt21 ) = graph_in_rows( \@rows );
    @nodes     = @$nodes;
    %src2tgt21 = %$src2tgt21;
    %is_node   = map { $_ => 1 } @nodes;
  };

  # first apply -b, -s, -f, and -x

  if ( $optgraph{b} )
  {
    my $lift = $optgraph{b}->[1];
    %src2tgt21 = %{ non0_concat( revert($lift), \%src2tgt21, $lift ) };
    my %in_result = endpoint21( \%src2tgt21 );
    # we don't want any additional nodes
    @nodes = grep { $in_result{$_} } @{ $optgraph{b}->[0] };
    %is_node = map { $_ => 1 } @nodes;
  }

  if ( $optgraph{s} )
  {
    my ( $src2tgt21, $nodes ) =
     concat2( \@nodes, \%src2tgt21, @{ $optgraph{s} } );
    %src2tgt21 = %$src2tgt21;
    @nodes     = @$nodes;
    %is_node   = map { $_ => 1 } @nodes;
  }

  if ( $optgraph{x} )
  {
    remove_nodes( \%src2tgt21, \%is_node, @{ $optgraph{x}->[0] } );
  }

  if ( $optgraph{f} )
  {
    my %in_f = map { $_ => 1 } @{ $optgraph{f}->[0] };
    #warn( scalar( keys %in_f ), "filter nodes\n" );
    remove_nodes( \%src2tgt21, \%is_node,
      grep { !defined $in_f{$_} } keys %is_node );
  }

  if ( $optgraph{X} )
  {
    # add the 'characteristic function' for the complement of -X
    my %is_X_node = map { $_ => 1 } @{ $optgraph{X}->[0] };

    foreach my $node ( keys(%is_node) )
    {
      $src2tgt21{$node}->{ 0 + defined( $is_X_node{$node} ) } = 1;
    }
  }

  if ( $optgraph{F} )
  {
    # add the 'characteristic function' for -F
    my %is_F_node = map { $_ => 1 } @{ $optgraph{F}->[0] };

    foreach my $node ( keys(%is_node) )
    {
      my $occurs = 0 + !defined( $is_F_node{$node} );
      $is_node{$occurs} = 1;
      $src2tgt21{$node}->{$occurs} = 1;
    }
  }

  # then apply -e

  my %cmp = (
    'eq' => sub { $_[0] eq $_[1] },
    'ge' => sub { $_[0] ge $_[1] },
    'gt' => sub { $_[0] gt $_[1] },
    'le' => sub { $_[0] le $_[1] },
    'lt' => sub { $_[0] lt $_[1] },
    'ne' => sub { $_[0] ne $_[1] }
  );

  # hack: add case insensitive comparisons
  foreach my $cmp ( keys %cmp )
  {
    $cmp{"i$cmp"} = sub {
      $cmp{$cmp}( map { lc } @_ );
     }
  }

  foreach my $op (@operation)
  {
    if ( $op eq 'chain' )
    {
      my %src2prevtgt;
      my %new_src2tgt21;

      # go through the arcs, sorted by node order
      my $node_cmp = by_orig_order( \@nodes, \%is_node );

      foreach my $src ( sort $node_cmp keys %src2tgt21 )
      {
        foreach my $tgt ( sort $node_cmp keys %{ $src2tgt21{$src} } )
        {
          my $newsrc = $src2prevtgt{$src} // $src;
          $new_src2tgt21{$newsrc}->{$tgt} = 1;
          $src2prevtgt{$src} = $tgt;
        }
      }

      %src2tgt21 = %new_src2tgt21;    # the list of nodes doesn't change
    }
    elsif ( $op eq 'square' )
    {
      %src2tgt21 = %{ squared( \%src2tgt21 ) };
    }
    elsif ( $op eq 'close' )
    {
      # replace the directed graph with its transitive closure
      %src2tgt21 = %{ trans_close( \%src2tgt21 ) };
    }
    elsif ( $op eq 'cycles' )
    {
      # remove all arcs that are not part of any cycles
      %src2tgt21 = %{ cycles_only( \%src2tgt21 ) };
    }
    elsif ( $op eq 'noncycles' )
    {
      # remove all arcs that are part of cycles
      remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) );
    }
    elsif ( $op eq 'cyclify' )
    {
      # replace all connected components with cycles on their nodes
      my $new = cycle_graph( \%src2tgt21 );
      remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) );
      add_arcs( \%src2tgt21, $new );
    }
    elsif ( $op eq 'petalify' )
    {
      # replace all connected components with arc pairs on their nodes
      my $new = petal_graph( \%src2tgt21 );
      remove_arcs( \%src2tgt21, cycles_only( \%src2tgt21 ) );
      add_arcs( \%src2tgt21, $new );
    }
    elsif ( $op eq 'contract' )
    {
      # reduce all cycles in the graph to a single node
      my %rest2first = map {
        my ( $first, @rest ) = @$_;
        map { $_ => $first } @rest;
       }
       grep { @$_ > 1 } @{ sorted_cycles( \%src2tgt21 ) };

      add_arcs( \%src2tgt21,
        substituted_arcs( \%src2tgt21, 'e', \%rest2first ) );
      remove_nodes( \%src2tgt21, \%is_node, keys %rest2first )
       ;    # QQQ swap statements?
    }
    elsif ( $op eq 'tsort' )
    {
      @nodes =
       @{ isolated_nodes( \%src2tgt21, \%is_node ),
        non0_topsort( \%src2tgt21 ) };
    }
    elsif ( $op eq 'reduce' )
    {
      %src2tgt21 = %{ trans_reduce( \%src2tgt21 ) };
    }
    elsif ( $op eq 'rmarcs' )
    {
      # remove all arcs
      %src2tgt21 = ();
    }
    elsif ( $op eq 'rev' )
    {
      # reverse the graph
      %src2tgt21 = %{ revert( \%src2tgt21 ) };

      # the nodes stay the same
    }
    elsif ( $op eq 'sym' )
    {
      # add the reverse graph
      while ( my ( $src, $tgt21 ) = each %src2tgt21 )
      {
        foreach my $tgt ( keys %$tgt21 )
        {
          $src2tgt21{$tgt}->{$src} = 1;
        }
      }

      # the nodes stay the same
    }
    elsif ( $op eq 'tgtcmp' )
    {
      # replace the directed graph with the one obtained by
      # comparing the target nodes per source node
      my %new_src2tgt21;

      foreach my $src1 ( keys %src2tgt21 )
      {
        foreach my $src2 ( keys %src2tgt21 )
        {
          if (
            !grep { !defined $src2tgt21{$src2}->{$_} }
            keys %{ $src2tgt21{$src1} }
           )
          {
            # all of $src1's targets are also targets of $src2
            # ($src2 includes $src1, when both are regarded as sets of targets)
            $new_src2tgt21{$src2}->{$src1} = 1;
          }
        }
      }

      %src2tgt21 = %new_src2tgt21;

      # this result is reflexive, so the nodes stay the same
    }
    elsif ( defined( my $cmpop = first { index( $op, $_ ) == 0 } keys %cmp ) )
    {
      my ( $cmper, $premodifier ) = (
        $cmp{$cmpop},
        ( $cmpop ne $op ? substr( $op, length($cmpop) + 1 ) : undef )
      );

      # retain only the arcs $a->$b for which $a $op $b
      if ( !defined($premodifier) || $premodifier eq '' )
      {
        # no premodification requested
        %src2tgt21 = %{ filtered_arcs( \%src2tgt21, $cmper ) };
      }
      elsif ( my ( $ld, $rx, $rd, $mods ) =
        $premodifier =~ m'^s([\/#])(.*)([^msixpogce])([msixpogce]*)$' )
      {
        if ( my ( $crit_rx, $repl, $use_g ) =
          match_or_subst( 's', $ld, $rx, $rd, $mods ) )
        {
          my %subst = node_substitution( \%is_node, $crit_rx, $repl, $use_g );
          # apply the premodification before comparing
          %src2tgt21 = %{
            filtered_arcs(
              \%src2tgt21,
              sub {
                &$cmper( map { $subst{$_} } @_ );
              }
            )
          };
          #warn "filtering leaves ", scalar( keys %src2tgt21 ), " arcs\n";
        }
        else
        {
          warn
           "uninterpretable pre-substitution, ignoring filter operation: $op\n";
        }
      }
      else
      {
        warn "malformed pre-substitution, ignoring filter operation: $op\n";
      }

      # note that no nodes are removed
    }
    elsif ( $op eq 'addid' )
    {
      # create arcs $node -> $node for every node
      $src2tgt21{$_}->{$_} = 1 for keys %is_node;
      # the set of nodes stays the same
    }
    elsif ( grep { $op eq $_ } ( 'in', 'out', 'io' ) )
    {
      # replace all arcs with the in-, out- or total degree of each node
      my ( $outdegree, $indegree ) = all_degrees( \%src2tgt21, \%is_node );
      my %degree =
         $op eq 'in'  ? %$indegree
       : $op eq 'out' ? %$outdegree
       :   map { $_ => $indegree->{$_} + $outdegree->{$_} } keys %is_node;
      my %new_src2tgt;

      while ( my ( $n, $d ) = each %degree )
      {
        $is_node{$d} = 1;
        $new_src2tgt{$n}->{$d} = 1;
      }
      %src2tgt21 = %new_src2tgt;
    }
    elsif (
      my (
        $ends, $sep, $neg, $crit, $dir,  $cmp, $n,
        $mors, $ld,  $rx,  $rd,   $mods, $body
      )
      = $op =~
m'^(l|r|b|e|)(:?)(!|)((in|out|io)(<|=|>)(\d+)|(m|s)([\/#])(.*)([^msixpogce])([msixpogce]*)|{(?<G>(?:[^{}]*+|{(?&G)})*)})$'
     )
    {
      # without $ends, retain only nodes that pass the given criterion
      # with $ends, retain only arcs whose end(s) pass the given criterion

      if ( defined $body )
      {
        # {body} is a shorthand for s/.*/$body/ except with {} delimiters
        ( $mors, $ld, $rx, $rd, $mods ) = ( 's', '}', ".*}$body", '}', '' );
      }

      # first, determine the nodes that do *not* pass the given criterion
      my @failing_nodes;

      # or in case of a substitution, which substitutions to make
      my %subst;

      if ($dir)
      {
        # $crit is a degree criterion
        # compute the indegrees and outdegrees of all nodes
        my $passes =
           $cmp eq '<' ? sub { $_[0] < $n }
         : $cmp eq '=' ? sub { $_[0] == $n }
         :               sub { $_[0] > $n };

        my $in  = ( $dir ne 'out' );
        my $out = ( $dir ne 'in' );

        my ( $outdegree, $indegree ) = non0_degrees( \%src2tgt21 );
        @failing_nodes = grep {
          my ( $in, $out ) = ( $indegree->{$_} // 0, $outdegree->{$_} // 0 );

          #warn "indegree($_) = $in\n";
          #warn "outdegree($_) = $out\n";

          my $passes =
              ( $dir eq 'in' && !&$passes($in) )
           || ( $dir eq 'out' && !&$passes($out) )
           || ( $dir eq 'io'  && !&$passes( $in + $out ) );

          $neg xor $passes    # if $neg is '!', reverse the condition
        } keys %is_node;
      }
      elsif ( $neg && $mors eq 's' )
      {
        warn "negated substitution is nonsensical, treated as no-op: $op\n";
      }
      else
      {
        # $crit is a regex operation
        #   with contents $rx and delimiters $ld and $rd

        my ( $crit_rx, $repl, $use_g ) =
         match_or_subst( $mors, $ld, $rx, $rd, $mods );

        if ($crit_rx)
        {
          if ( $mors eq 'm' )
          {
            warn "g ignored, doesn't make sense in match operation $op\n"
             if $use_g;
            @failing_nodes = grep { $neg xor !/$crit_rx/ } keys %is_node;
          }
          else
          {
            %subst = node_substitution( \%is_node, $crit_rx, $repl, $use_g );
          }
        }
        else
        {
          my $match = ( $mors eq 'm' ) ? 'match' : 'substitution';
          warn "$match operation with invalid pattern treated as no-op: $op\n";
        }
      }

      $mors = 'm' if !defined $mors;

      if ( $ends eq '' )
      {
        if ( $mors eq 'm' )
        {
          # remove all nodes that do *not* meet the given criterion
          remove_nodes( \%src2tgt21, \%is_node, @failing_nodes );
        }
        else
        {
          # perform the substitution on all nodes (and arc endpoints)
          %src2tgt21 = %{ substituted_arcs( \%src2tgt21, $ends, \%subst ) };
          %is_node =
           map { ( defined $subst{$_} ? $subst{$_} : $_ ) => 1 } keys %is_node;
        }
      }
      elsif ( $mors eq 'm' )
      {
        # remove all arcs of which the indicated end point(s)
        # do(es) *not* meet the given criterion
        my %fails = map { $_ => 1 } @failing_nodes;

        while ( my ( $src, $tgt21 ) = each %src2tgt21 )
        {
          if ( $fails{$src} && $ends =~ /l|b/ )
          {
            #warn "removing $src -> *\n";
            delete $src2tgt21{$src};
          }
          else
          {
            foreach my $tgt ( keys %$tgt21 )
            {
              if ( $fails{$tgt}
                && ( $ends =~ /r|b/ || ( $ends =~ /e/ && $fails{$src} ) ) )
              {
                #warn "removing $src -> $tgt\n";
                delete $tgt21->{$tgt};
                delete $src2tgt21{$src}->{$tgt};
              }
            }

            if ( !%$tgt21 )
            {
              #warn "removed all arcs for $src\n";
              delete $src2tgt21{$src};
            }
          }
        }
      }
      else
      {
        if ( $ends eq 'b' )
        {
          #warn "interpreting b as e in the substitution command $op\n";
        }

        # perform the substitution on all indicated arc end point(s),
        # but if only on one end, don't remove any nodes on the other end

        # first, compute the new arcs
        my $new_src2tgt21 = substituted_arcs( \%src2tgt21, $ends, \%subst );

        # update the arcs, but also update the set of nodes:
        # a node disappears if it no longer occurs as an end point
        # and appears if it didn't yet occur as an end point
        my %oldnonendpoint21 =
         map { $_ => 1 } isolated_nodes( \%src2tgt21, \%is_node );
        %src2tgt21 = %$new_src2tgt21;
        %is_node = ( %oldnonendpoint21, endpoint21($new_src2tgt21) );
        #warn(" the nodes are now: ",join(', ', keys %is_node), "\n");
        #warn "the new arcs are now:\n"; dump_arcs(\%src2tgt21);
      }
    }
    else
    {
      warn "unrecognized operation treated as no-op: $op\n";
    }
  }

  ( \@nodes, \%is_node, \%src2tgt21 )

}

sub non0_topsort
# yields a reference to the list of arc endpoints in topological sort order
{
  my ($src2tgt21) = @_;

  my %rest2first = map {
    my ( $first, @rest ) = @$_;
    map { $_ => $first } @rest;
   }
   grep { @$_ > 1 } @{ sorted_cycles($src2tgt21) };

  my ( $out, $in ) = non0_degrees($src2tgt21);
  my @current = grep { !defined $in->{$_} } keys %$out;
  my @topsorted;

  while ( my $src = shift(@current) )
  {
    push( @topsorted, $src );

    foreach my $tgt ( keys %{ $src2tgt21->{$src} } )
    {
      if ( !--$in->{$tgt} )
      {
        push( @current, $tgt );
      }
    }
  }

  [@topsorted];
}

sub match_or_subst
# parses and interprets, but does not execute, a regexp-using operation
{
  my ( $mors, $ld, $rx, $rd, $mods ) = @_;

  my $op = "$ld$rx$ld$rd$mods";
  my $repl;

  if ( $mors eq 's' )
  {
    # it is a substitution operation, so $rx must contain
    # a delimiter, followed by the (possibly empty) replacement
    if ( ( my $delpos = index( $rx, $ld ) ) >= 0 )
    {
      ( $rx, $repl ) =
       ( substr( $rx, 0, $delpos ), substr( $rx, $delpos + 1 ) );
    }
    else
    {
      warn "assuming an empty replacement in substitution operation $op\n";
      $repl = '';
    }

    if ( $ld ne $rd )
    {
      # cross fingers - perhaps the closing delimiter was omitted
      warn "adding closing delimiter on substitution operation $op\n";
      $repl .= $rd;    # equivalent, but harder to formulate
    }
  }
  elsif ( $ld ne $rd )
  {
    # cross fingers - perhaps the closing delimiter was omitted
    warn "adding closing delimiter on match operation $op\n";
    $rx .= $rd;        # equivalent, but harder to formulate
  }

  my $use_g = 0;

  if ($mods)
  {
    # we can't just suffix the operation with $mods, unfortunately
    my $non_g_mods = $mods;
    $use_g = ( $non_g_mods =~ s/g//g );
    $rx = "(?$non_g_mods)$rx" if $non_g_mods;
  }

  ( eval { $ld eq '/' ? qr/$rx/ : qr#$rx# }, $repl, $use_g );
}

sub node_substitution
# yields the substitution s/$crit_rx/$repl/g?
# on the given nodes, returning the result as a function (hash)
{
  my ( $is_node, $crit_rx, $repl, $use_g ) = @_;

  map {
    my $orig = $_;
    if   ($use_g) { s/$crit_rx/qq(qq($repl))/eeg }    # thank you,
    else          { s/$crit_rx/qq(qq($repl))/ee; }    # anno@#perl
    ( $orig, $_ )
  } keys %$is_node;
}

sub print_graph
# print the directed graph, with the nodes in original order
# (and any non-original nodes following in arbitrary order);
# this is important if we want to compare results with originals
# using tools that depend on order, such as diff
{
  my ( $oldnodes, $is_node, $src2tgt21 ) = @_;

  # list all nodes, with the original ones first, in original order
  my $node_cmp = by_orig_order( $oldnodes, $is_node );

  # print the isolated nodes in node order
  print_rows(
    map { [$_] }
    sort $node_cmp isolated_nodes( $src2tgt21, $is_node )
  );

  if ( !$print_paths )    # no -p
  {
    # print the arcs, sorted by node order
    foreach my $src ( sort $node_cmp keys %$src2tgt21 )
    {
      print_rows(
        map { [ $src, $_ ] }
        sort $node_cmp keys %{ $src2tgt21->{$src} }
      );
    }
  }
  else
  {
    # remove all cycle arcs
    my $wo_cycles = copy($src2tgt21);
    remove_arcs( $wo_cycles, cycles_only($src2tgt21) );
    # compute all maximal paths
    # we can do it in one pass if we visit the nodes in the right order
    my %src2paths;    # each value is an arrayref of arrayrefs, each a path
    my %initial21;    # is this node at the start of a path?

    my @bottomsorted = reverse @{ non0_topsort($wo_cycles) };
    #warn "bottomsorted nodes: @bottomsorted\n";
    foreach my $src (@bottomsorted)
    {
      $initial21{$src} = 1;

      if ( defined( $wo_cycles->{$src} ) )
      {
        my @tgts = sort $node_cmp keys %{ $wo_cycles->{$src} };
        #warn "$src has ",  scalar(@tgts), " children: @tgts\n";
        my @tgtpaths = @tgts ? map { $src2paths{$_} } @tgts : [ [] ];
        @tgtpaths = map { @$_ } @tgtpaths;
        #warn "$src has ",  scalar(@tgtpaths), " continuation paths\n";
        $src2paths{$src} = [ map { [ $src, @$_ ] } @tgtpaths ];
        delete $initial21{$_} for @tgts;
      }
      else
      {
        $src2paths{$src} = [ [$src] ];
        warn "paths for leaf $src: ", @{ $src2paths{$src} }, "\n";
      }
    }

    #warn "initial: $_\n" for sort $node_cmp keys %initial21;

    print_rows( map { @{ $src2paths{$_} } } sort $node_cmp keys %initial21 );
  }
}

sub by_orig_order
# orders all nodes stably (that is why it adds the new ones first)
{
  my ( $oldnodes, $is_node ) = @_;

  my $last     = $#$oldnodes;
  my %node2idx = map { $oldnodes->[$_] => $_ } 0 .. $last;
  my @newnodes = grep { !defined $node2idx{$_} } keys %$is_node;
  $node2idx{$_} = ++$last for @newnodes;

  sub { $node2idx{$a} <=> $node2idx{$b} };
}

sub HELP_MESSAGE
{
  print STDERR <<ZZ;

Usage: $0 [-2] [-t sep] [-f filea] [-x fileb] [-F filec] [-X filed] [-e ops] [file1 [file2 [...]]]

to interpret the input file(s), and/or standard input,
as a directed graph, optionally transform it, and output it.
The input must be in CSV format; the output will be in CSV format as well.

Input interpretation:
Lines with multiple fields are interpreted as stating paths.
Lines with a single field are interpreted as stating node names;
this is only required to produce isolated nodes in the output.

Options:

  -p

    Output isolated nodes and not pairs, but sequences of nodes, namely,
    all longest paths in which no arc is part of a cycle.

  -q char

    Use char as the quote character (instead of none at all).

  -t sep

    Use sep (a string) as field separator (instead of a comma).

  -u sep

    Use sep as output field separator (instead of the input field separator).

  -b filename

    Lift the input graph by the graph given in the filename,
    i.e., csvtrans -b S R outputs S^-1.R.S, so it is a shorthand for
    csvtrans -e rev S | csvtrans -s R | csvtrans -s S.
    This is applied before any other operations.

  -s filename

    Replace the graph with the relational composition of it
    and the graph in the given CSV file (not the other way round).
    This is applied before any operations except -b.

  -f filename

    Limit the graph to the nodes that occur in the given CSV file.
    This is applied before any operations except -b and -s.

  -x filename

    Limit the graph to the nodes that do not occur in the given CSV file.
    This is applied before any operations except -b and -s.

  -F filename

    Add the characteristic function telling whether nodes occur
    in the given CSV file, i.e. arcs (n, 0) or (n, 1) depending on
    whether node n occurs in the file.

  -X filename

    Add the characteristic function telling whether nodes do *not* occur
    in the given CSV file, i.e. arcs (n, 1) or (n, 0) depending on
    whether node n occurs in the file.

  -e ops

    Apply the given operations, in order.
    They must be specified as a single string, separated with a semicolon.
    Each operation modifies the directed graph obtained from the input:

      square  replaces its arcs with its paths of length 2
      close   replaces it with its transitive closure
      reduce  transitively reduces it (not touching cycles)
      cycles  removes all arcs that are not part of any cycle
   noncycles  removes all arcs that are part of any cycle
     cyclify  replaces all arcs in a strong component
              with single cycles on the nodes of that component
    petalify  replaces all arcs in a strong component
              with arc pairs on the nodes of that component
    contract  reduces each cycle to one node (the alphabetically first)
      rev     reverses it
      sym     adds the reverse (producing its symmetric closure)
      rmarcs  removes all arcs
      tgtcmp  replaces the graph with the pairwise comparison
                of sets of targets of nodes
      addid   adds self-loops on all nodes (that don't already have them)
      tsort   orders the nodes topologically
              (by default, the input order is preserved)
      chain   for all input pairs (x, y), outputs either
                (y', y) for the latest earlier pair (x, y'), or
                (x, y)  if no such pair exists
              (hint: use this with -p, but any cycles will be removed,
               and beware of crossing paths)
      in      replaces all arcs with the function
      out     n -> d for each node n, where d is n's 
      io      indegree, outdegree, or indegree+outdegree, respectively

      cmp     retains only the arcs \$a -> \$b for which \$a cmp \$b
      cmp:subst
              retains only the arcs \$a -> \$b for which sa cmp sb
              where sa, sb are the result of applying substitution
              subst to \$a and \$b, respectively

        where cmp is one of Perl's string comparison operators

           eq, ne, gt, ge, lt, le

        or, for case-insensitive comparison, the same preceded with i:

           ieq, ine, igt, ige, ilt, ile

        and subst is a Perl substitution command, e.g. s#.(.).*#\$1#
        (see details below);
        so ieq is a shorthand for eq:s#.*#\L\$&#, etcetera

      crit    retains only the nodes that meet crit
      l:crit  retains only the arcs whose source meets crit
      r:crit  retains only the arcs whose target meets crit
      e:crit  retains only the arcs whose source or target meets crit
      b:crit  retains only the arcs whose source and target meets crit

        where crit is one of:

          in=0, in>0, in=1, in>1, in<2, in=2, in>2, ...
            a constraint on the node's number of incoming arcs

          out=0, out>0, out=1, ...
            a constraint on the node's number of outgoing arcs

          io=0, io>0, io=1, ...
            a constraint on the node's total number of arcs

          m/regexp/msixpoce, m#regexp#msixpoce
            a constraint on the node name;
            regexp is a Perl regular expression
              that does not contain its delimiter (/ or #);
            msixpoce is zero of more of the characters in msixpoge,
              and serves as a (Perl) matching modifier

        and crit may be prefixed with ! to negate it;

      subst    modifies nodes
      l:subst  modifies arc sources
      r:subst  modifies arc targets
      e:subst  modifies arc sources and targets

        where subst is one of:

          s/regexp/repl/msixpogce, s#regexp#repl/msixpogce
            a (Perl) substitution applied to the node name;
            regexp and msixpogce are as before, except that g is allowed;
            repl is the substituted value; or

          {repl}
            which is a shorthand for s/.*/repl/;

        if arcs are being modified, four strings have special meaning in repl:

          %n  stands for the arc endpoint being modified
          %o  stands for the other endpoint
          %l  stands for the arc source
          %r  stands for the arc target

        Example:

          e:s/.*/%o/g
          e:{%o}

        are both equivalent to

          rev

ZZ

  exit(0);
}

sub rows_of_csv
{
  my @rows;

  my ($f) = @_;

  my $infh;

  if ( $f eq '-' )
  {
    $infh = *STDIN;
  }
  elsif ( !open( $infh, '<', $f ) )
  {
    warn "warning: cannot read from $f\n";
  }

  while ( my $row = $csvr->getline($infh) )
  {
    push( @rows, $row );
  }
  #warn "read ", scalar(@rows), " rows from $f\n";

  \@rows;
}

sub print_rows
{
  foreach my $row (@_)
  {
    $csvw->print( *STDOUT, $row );
  }
}

sub graph_in_rows
{
  my ( @nodes, %is_node, %src2tgt21 );

  foreach my $row ( @{ $_[0] } )
  {
    foreach my $node (@$row)
    {
      if ( !defined $is_node{$node} )
      {
        push( @nodes, $node );
        $is_node{$node} = 1;
        #warn "added node $node\n";
      }
    }

    if ( @$row > 1 )
    {
      # 2 or more rows - a path
      foreach my $arc ( pairs(@$row) )
      {
        # an arc on the path
        $src2tgt21{ $arc->[0] }->{ $arc->[1] } = 1;
        #warn "added arc $arc->[0] -> $arc->[1] \n";
      }
    }
  }

  ( \@nodes, \%src2tgt21 );
}

sub pairs
{
  #warn 'pairs(', join(', ', @_), ")\n";

  if ( !@_ )
  {
    ();
  }
  elsif ( @_ == 1 )
  {
    ( [] );
  }
  else
  {
    map { [ $_[$_], $_[ $_ + 1 ] ] } 0 .. $#_ - 1;
  }
}

sub petals
{
  #warn 'petals(', join(', ', @_), ")\n";
  if ( !@_ )
  {
    ();
  }
  elsif ( @_ == 1 )
  {
    ( [] );
  }
  else
  {
    map { [ $_[0], $_[$_] ], [ $_[$_], $_[0] ] } 1 .. $#_;
  }
}

sub remove_nodes
{
  my ( $src2tgt21, $is_node ) = @_;

  delete $is_node->{$_} for @_;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
    if ( !$is_node->{$src} )
    {
      delete $src2tgt21->{$src};
    }
    else
    {
      foreach my $tgt ( keys %$tgt21 )
      {
        if ( !$is_node->{$tgt} )
        {
          delete $tgt21->{$tgt};
        }
      }
      delete $src2tgt21->{$src} if !%$tgt21;
    }
  }
}

sub add_arcs
{
  my ( $src2tgt21, $new_src2tgt ) = @_;

  while ( my ( $src, $n_tgt21 ) = each %$new_src2tgt )
  {
    foreach my $tgt ( keys %$n_tgt21 )
    {
      #warn "adding $src->$tgt\n" if !defined $src2tgt21->{$src}->{$tgt};
      $src2tgt21->{$src}->{$tgt} = 1;
    }
  }
}

sub remove_arcs
{
  my ( $src2tgt21, $doomed_src2tgt ) = @_;

  while ( my ( $src, $n_tgt21 ) = each %$doomed_src2tgt )
  {
    my $tgt21 = $src2tgt21->{$src};
    delete $tgt21->{$_} for keys %$n_tgt21;
    delete $src2tgt21->{$src} if !%$tgt21;
  }
}

sub has_arc
{
  my ( $src2tgt21, $src, $tgt ) = @_;
  defined $src2tgt21->{$src}->{$tgt};
}

sub arcs_for_which
{
  my ( $src2tgt21, $arcprop ) = @_;

  my %new_src2tgt;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
    foreach my $tgt ( keys %$tgt21 )
    {
      if ( &$arcprop( $src, $tgt ) )
      {
        ++$new_src2tgt{$src}->{$tgt} if !$new_src2tgt{$src}->{$tgt};
      }
    }
  }

  \%new_src2tgt;
}

sub pairs2arcs
{
  my %src2tgt21;

  foreach my $p (@_)
  {
    my ( $src, $tgt ) = @$p;
    $src2tgt21{$src}->{$tgt} = 1;
  }
  \%src2tgt21;
}

sub non0_degrees
# does not include degrees of 0
# argument: \%src2tgt21
{
  my ( %outdegree, %indegree );

  while ( my ( $src, $tgt21 ) = each %{ $_[0] } )
  {
    foreach my $tgt ( keys %$tgt21 )
    {
      ++$outdegree{$src};
      ++$indegree{$tgt};
    }
  }

  ( \%outdegree, \%indegree );
}

sub all_degrees
# does include degrees of 0
# arguments: \%src2tgt21, \%is_node
{
  my $is_node = $_[1];
  die "BUG: missing or wrong 2nd argument to all_degrees()\n"
   if !defined($is_node) || ref $is_node ne 'HASH';

  my ( $outdegree, $indegree ) = non0_degrees( $_[0] );
  $outdegree->{$_} = 0 for grep { !defined $outdegree->{$_} } keys %$is_node;
  $indegree->{$_}  = 0 for grep { !defined $indegree->{$_} } keys %$is_node;
  ( ( $outdegree, $indegree ) );
}

sub srcs_and_tgts
{
  map { [ keys %$_ ] } non0_degrees( $_[0] );
}

sub srcs_not_tgts
{
  my ( $out, $in ) = non0_degrees( $_[0] );
  [ grep { !defined $in->{$_} } keys %$out ];
}

sub targets
{
  my ( $out, $in ) = non0_degrees( $_[0] );
  [ keys %$in ];
}

sub endpoint21
# maps each node that is an arc source or target to 1
{
  map { $_ => 1 } map { keys %$_ } non0_degrees( $_[0] );
}

sub endpoints
# lists the nodes that are an arc source and/or target
{
  my %ep21 = endpoint21( $_[0] );
  keys %ep21;
}

sub isolated_nodes
# lists the nodes that are no arc source or target
{
  my ( $src2tgt21, $is_node ) = @_;

  my %ep21 = endpoint21($src2tgt21);
  grep { !defined $ep21{$_} } keys %$is_node;
}

sub filtered_arcs
# lists the arcs that satisfy the given arc predicate
{
  my ( $src2tgt21, $arc_passes ) = @_;
  my %new_src2tgt21;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
    foreach my $tgt ( keys %{ $src2tgt21->{$src} } )
    {
      if ( &$arc_passes( $src, $tgt ) )
      {
        $new_src2tgt21{$src}->{$tgt} = 1;
        #warn "arc_filter($src->$tgt) passes\n";
      }
    }
  }
  \%new_src2tgt21;
}

sub substituted_arcs
# computes the substitute arcs according to
#  + the endpoint specification $end;
#  + the *node* substitution function %$subst
#    ('l' for sources only, 'r' for targets only, 'e' for both ends)
# if $subst->{$src} or $subst->{$tgt} is undef, the arc in question is omitted,
# but if it does not *exist*, the end point in question remains unchanged
{
  my ( $src2tgt21, $ends, $subst ) = @_;
  my %new_src2tgt21;

#warn "substituting (at $ends):\n", map { "$_ -> $subst->{$_}\n" } sort keys %$subst;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
#warn "within arcs:\n", map { "$src -> $_\n" } sort keys %$tgt21;
    my $newsrc = $src;

    if ( $ends ne 'r' && exists $subst->{$src} )
    {
      #warn "substituting $src -> $subst->{$src}\n";
      $newsrc = $subst->{$src};
      next if !defined $newsrc;
    }

    foreach my $tgt ( keys %{ $src2tgt21->{$src} } )
    {
      my $newtgt = $tgt;

      if ( $ends ne 'l' && exists $subst->{$tgt} )
      {
        #warn "substituting $tgt -> $subst->{$tgt}\n";
        $newtgt = $subst->{$tgt};
      }

      next if !defined $newtgt;

      # only now, substitute %l and %r in $newsrc and $newtgt
      my $newersrc = join( $src, split( /%[ln]/, $newsrc, -1 ) );
      $newtgt   = join( $tgt, split( /%[rn]/, $newtgt,   -1 ) );
      $newersrc = join( $tgt, split( /%[ro]/, $newersrc, -1 ) );
      $newtgt   = join( $src, split( /%[lo]/, $newtgt,   -1 ) );
      #warn "substituting $src -> $tgt with $newersrc -> $newtgt\n";
      $new_src2tgt21{$newersrc}->{$newtgt} = 1;
    }
  }

#warn "arcs after substitution:\n", map { my $src = $_; map { "$src -> $_\n" } sort keys %{$new_src2tgt21{$src}} } sort keys %new_src2tgt21;
  \%new_src2tgt21;
}

sub copy
# %$src2tgt21 is not a *deep* copy of $src2tgt21
{
  my ($src2tgt21) = @_;

  my %copy;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
    $copy{$src} = {%$tgt21};    # a copy
  }

  \%copy;
}

sub revert
{
  my ($src2tgt21) = @_;

  my %tgt2src21;

  while ( my ( $src, $tgt21 ) = each %$src2tgt21 )
  {
    foreach my $tgt ( keys %$tgt21 )
    {
      $tgt2src21{$tgt}->{$src} = 1;
    }
  }

  \%tgt2src21;
}

sub squared_arcs
# for a relation R, yields R^2
{
  non0_concat2( $_[0], $_[0] );
}

sub non0_concat2
# for relations R, S, yields R.S, only on arcs
{
  my ( $src2mid21, $mid2tgt21 ) = @_;

  my %src2tgt21;

  while ( my ( $src, $mid21 ) = each %$src2mid21 )
  {
    foreach my $mid ( keys %$mid21 )
    {
      next if !defined $mid2tgt21->{$mid};

      foreach my $tgt ( keys %{ $mid2tgt21->{$mid} } )
      {
        $src2tgt21{$src}->{$tgt} = 1;
        #warn "$src -> $mid -> $tgt\n";
      }
    }
  }

  \%src2tgt21;
}

sub non0_concat
# for relations R1, ..., Rn, yields R1 . ... . Rn, only on arcs
{
  my $r1 = shift(@_) // die "BUG: argumentless call to concat()\n";

  if (@_)
  {
    non0_concat2( $r1, non0_concat(@_) );
  }
  else
  {
    $r1;
  }
}

sub concat2
# non0_concat2 and intersect the isolated nodes
{
  my ( $nodes1, $src2mid21, $nodes2, $mid2tgt21 ) = @_;
  my %ep1       = endpoint21($src2mid21);
  my %ep2       = endpoint21($mid2tgt21);
  my $src2tgt21 = non0_concat2( $src2mid21, $mid2tgt21 );
  my %ep3       = endpoint21($src2tgt21);

  my %is_node1 = map { $_ => 1 } @$nodes1;
  my @nodes2not1 = grep { !$is_node1{$_} } @$nodes2;
  my %isol1 = map { $_ => 1 } grep { !defined $ep1{$_} } @$nodes1;
  my %isol2 = map { $_ => 2 } grep { !defined $ep2{$_} } @$nodes2;
  my @nodes2keep =
   grep { $ep3{$_} || ( $isol1{$_} && $isol2{$_} ) } ( @$nodes1, @nodes2not1 );
  ( non0_concat2( $src2mid21, $mid2tgt21 ), [@nodes2keep] );
}

sub nodes_to_selfloops
# given a list of nodes, yields a graph with selfloops
{
  my $ref = { map { $_ => { $_ => 1 } } @_ };
  # the assignment is to force returning a hashref (!!)
}

sub nodes_with_selfloops
{
  my ($src2tgt21) = @_;

  grep { $src2tgt21->{$_}->{$_} } keys %$src2tgt21;
}

sub without_selfloops
{
  my $rs_src2tgt21 = copy( $_[0] );

  delete $rs_src2tgt21->{$_}->{$_} for keys %$rs_src2tgt21;

  $rs_src2tgt21;
}

sub trans_close
# for a relation R, yields R^+ =D= R u R^2 u R^3 u ...
{
  #my %src2tgt21 = %{ $_[0] }; # not a deep copy!
  my $src2tgt21 = copy( $_[0] );
  my $tgt2src21 = revert( $_[0] );

  foreach my $mid ( keys %$src2tgt21 )
  {
    foreach my $src ( keys %{ $tgt2src21->{$mid} } )
    {
      foreach my $tgt ( keys %{ $src2tgt21->{$mid} } )
      {
        $src2tgt21->{$src}->{$tgt} = 1;
        $tgt2src21->{$tgt}->{$src} = 1;
        #warn "$src -> $mid -> $tgt\n";
      }
    }
  }

  $src2tgt21;
}

sub trans_refl_close
# for a relation R, yields R^* =D= I u R^ u R^2 u R^3 u ...
{
  my ( $src2tgt21, $is_node ) = @_;

  my $trcl = trans_close($src2tgt21);
  $trcl->{$_}->{$_} = 1 for keys %$is_node;
  $trcl;
}

sub trans_reduce
# for a relation R, yields its smallest subset that
# - has the same transitive closure
# - has the exact same cycles
# so reduce the acyclic part of the graph; don't touch the cycles
# this took a LONG time to get right, and I'm not even sure if it is
# input graph used for testing:
#a,b,c,d,e,f,b,g,h
#a,h
#i,d,b
{
  my ($src2tgt21) = @_;

  # follow TRANSITIVE REDUCTION OF A DIRECTED GRAPH (Aho, Garey & Ullman)
  # which gives an algorithm on p. 136
  #my $wo_sl = without_selfloops( $src2tgt21 );
  #my $redundancy = squared( $wo_sl );
  # except that that algorithm is flawed (try inputting a,b,c,d; a,d)
  #my $redundancy = non0_concat2( $wo_sl, trans_close( $wo_sl ) );
  # and that we treat all arcs on cycles as having length 0 and leave them in
  # there may be a more efficient implementation ...
  my $zerosteps = cycles_only($src2tgt21);
  add_arcs( $zerosteps, nodes_to_selfloops( endpoints($src2tgt21) ) );
  my $wo_cycles = copy($src2tgt21);
  remove_arcs( $wo_cycles, $zerosteps );
  my $onestep = non0_concat2( $zerosteps, $wo_cycles );
  add_arcs( $onestep, $wo_cycles );
  add_arcs( $onestep, non0_concat2( $wo_cycles, $zerosteps ) );
  my $redundancy = non0_concat2( $onestep, trans_close($onestep) );

  my $reduced = copy($src2tgt21);
  remove_arcs( $reduced, $redundancy );
  $reduced;
}

sub dump_arcs
# for debugging
{
  my ($src2tgt21) = @_;

  foreach my $src ( sort keys %$src2tgt21 )
  {
    foreach my $tgt ( sort keys %{ $src2tgt21->{$src} } )
    {
      warn "$src->$tgt\n";
    }
  }
}

sub reachable
{
  keys %{ reachable21(@_) };
}

sub reachable21
{
  my ( $src2tgt21, @next ) = @_;
  my %r21;

  while ( @next = grep { !$r21{$_} } map { keys %{ $src2tgt21->{$_} } } @next )
  {
    $r21{$_} = 1 for @next;
  }

  #warn join(',', @_), ' ->> ', join(',', sort keys %r21), "\n";
  \%r21;
}

sub cycles_with_map
{
  # this is not the most efficient method
  my $trcl = trans_close(@_);
  my %node2repr;
  my @cycles;

  foreach my $src ( keys %$trcl )
  {
    next if defined $node2repr{$src};

    my $tgt21 = $$trcl{$src};
    my @rest;
    my $repr;

    foreach my $tgt ( grep { !defined $node2repr{$_} } keys %$tgt21 )
    {
      if ( defined $trcl->{$tgt}->{$src} )    # a cycle
      {
        $node2repr{$tgt} = $repr = $src;
        push( @rest, $tgt ) if $tgt ne $src;
        #warn "$src <- $tgt\n";
      }                                       #else { warn "$src </ $tgt\n"; }
    }

    if ( defined $repr )
    {
      # $src is on a cycle
      $node2repr{$src} = $repr;               # = $src itself
      push( @cycles, [ $src, @rest ] );
      #warn "cycle: ", join(' -> ', $src, @rest, $src), "\n";
    }
  }

  ( \@cycles, \%node2repr );
}

sub cycles
# produces an arrayref with arrays listing the nodes of each strongly connected component
{
  ( cycles_with_map(@_) )[0];
}

sub cycle_map
# produces an hashref that maps each node to the same node in their strongly conected component
{
  ( cycles_with_map(@_) )[1];
}

sub sorted_cycles
# returns an unsorted list of arrays, each listing the sorted nodes of a cycle
{
  [ map { [ sort @$_ ] } @{ cycles(@_) } ];
}

sub cycle_graph
# returns arcs constituting a cycle per connected component
{
  pairs2arcs( map { pairs( @$_, $_->[0] ) } @{ sorted_cycles(@_) } );
}

sub petal_graph
# returns arcs constituting a petal per connected component
{
  pairs2arcs( map { petals(@$_) } @{ sorted_cycles(@_) } );
}

sub cycles_only
# returns the arcs on cycles
{
  my ($src2tgt21) = @_;

  my $node2repr         = cycle_map($src2tgt21);
  my $in_same_component = sub {
    my ( $sr, $tr ) = map { $node2repr->{$_} } @_;
    defined($sr) && defined($tr) && $sr eq $tr;
  };
  arcs_for_which( $src2tgt21, $in_same_component );
}