#!/usr/bin/env perl
#
# uncomment - remove or modify comments in source code
#
# $Id$
#
# based on an earlier script, loc.pl

# NOTE: not everything works, but I only need -l c++ for now

use warnings;
use strict;

use Getopt::Std;
use List::Util 'sum';
use List::MoreUtils 'firstidx';

my %opt;
getopts( 'h?r1:2:e:l:p:q:', \%opt )
 or exit(-1);

my @lang =
 $opt{'l'} ? split( /\s+/, $opt{'l'} ) : ('c');    # languages (shortcuts)

# pairs of strings:
my %lang2q = (
  'c'    => '" "',
  'c++'  => '" "',
  'cs'   => '" "',
  'c#'   => '" "',
  'java' => '" "'
);

# pairs of strings:
my %lang22 = (
  'c'    => '/* */',
  'c++'  => '/* */',
  'cs'   => '/* */',
  'c#'   => '/* */',
  'java' => '/* */'
);

# lists of strings:
my %lang21 = (
  'c++'  => '//',
  'cs'   => '//',
  'vb'   => "'",
  'perl' => '#',
  'sh'   => '#'
);

# a regexp:
# \\\\ because one escape is for Perl, one for within the regexp
my %lang2e = (
  'c++'  => '\\\\.',
  'cs'   => '\\\\.',
  'vb'   => '""',
  'perl' => '\\\\.',
  'sh'   => '\\\\.',
);

# a regexp:
my %lang2p = ( 'perl' => '[sm$]#' );

my %is_lang =
 map { $_ => 1 } map { keys %$_ } ( \%lang2q, \%lang22, \%lang21, \%lang2e );

if ( my @wronglang = grep { !defined $is_lang{$_} } @lang )
{
  my @rightlang = sort keys %is_lang;
  die 'supported values for -l are: ', join( ', ', @rightlang ), ', not: ',
   @wronglang, "\n";
}

HELP_MESSAGE() if $opt{'h'} || $opt{'?'};

my $always_print_newlines = $opt{'r'};

@ARGV = ('-') if !@ARGV;

my $lang21 = join( ' ', grep { $_ ne '' } map { $lang21{$_} // '' } @lang );
my $lang22 = join( ' ', grep { $_ ne '' } map { $lang22{$_} // '' } @lang );
my $lang2q = join( ' ', grep { $_ ne '' } map { $lang2q{$_} // '' } @lang );
my $lang2e = join( '|', grep { $_ ne '' } map { $lang2e{$_} // '' } @lang );
my $lang2p = join( '|', grep { $_ ne '' } map { $lang2p{$_} // '' } @lang );

# TO DO: support multiple bracket pairs
#my @justopen      = split( /\s+/, $opt{'1'} // $lang21 );    # 1-line separators
#my %open2close    = split( /\s+/, $opt{'2'} // $lang22 );    # brackets
#my %quote2unquote = split( /\s+/, $opt{'q'} // $lang2q );    # quote pairs
#
#warn "comment brackets: ", join(' ', map { ($_, $open2close{$_}) } keys %open2close), "\n";
#warn "comment starters: ", join(' ', @justopen), "\n";

my @linec_delim  = split( /\s+/, $opt{'1'} // $lang21 );
my @blockc_delim = split( /\s+/, $opt{'2'} // $lang22 );
my @string_delim = split( /\s+/, $opt{'q'} // $lang2q );

my %rx_for = (
  'blockstart' => ( @blockc_delim     ? quotemeta( $blockc_delim[0] ) : '' ),
  'blockend'   => ( @blockc_delim > 0 ? quotemeta( $blockc_delim[1] ) : '' ),
  'linecstart' =>
   ( @linec_delim ? join( '|', map { quotemeta } @linec_delim ) : '' ),
  'newline' => '[\r\n]+',    # FreeRTOS has some funny line ends
  'quote' => ( @string_delim ? quotemeta( $string_delim[0] ) : '' ),
  'unquote' => ( @string_delim > 1 ? quotemeta( $string_delim[1] ) : '' ),
  'escaped'   => ( $opt{'e'} // $lang2e // '' ),    # escapes in strings
  'protected' => ( $opt{'p'} // $lang2p // '' )     # escapes outside strings
);

#warn "regex($_) = $rx_for{$_}\n" for keys %rx_for;

# while scanning the input, we can be in 4 $states:
#
#  - 'blockc': in a block comment
#  -  'linec': in an end-of-line comment
#  - 'string': in a string
#  -   'none': elsewhere
#
# start: none
# final: none
# transitions:
# none  -blockstart-> blockc
# blockc  -blockend-> none
# none  -linecstart-> linec
# linec    -newline-> none
# none       -quote-> string
# string   -unquote-> none
# none   -protected-> none
# string   -escaped-> string
#
# however, escaped and protected must be matched
# *before* other transitions, because their regular expressions
# may overlap with them and the order matters
# (the transitions aren't on single symbols!)

# the regexps to scan for in each state;
# note that escaped and protected are at the front
my %transitions = (
  'none'   => [qw(protected linecstart blockstart quote)],
  'blockc' => [qw(blockend)],
  'linec'  => [qw(newline)],
  'string' => [qw(escaped unquote)]
);

# transitions
my %next_state = (
  'none'   => [qw(none linec blockc string)],
  'blockc' => [qw(none)],
  'linec'  => [qw(none)],
  'string' => [qw(string none)]
);

# whether to output $`
my %output_skipped = (
  'none'   => 1,
  'blockc' => 0,
  'linec'  => 0,
  'string' => 1
);

# whether to output $&
my %output_matched = (
  'none'   => [qw(1 0 0 1)],
  'blockc' => [qw(0)],
  'linec'  => [qw(1)],
  'string' => [qw(1 1)]
);

# the matching process:

# we want to match for the transition markers in a single regular expression,
# in which they are alternatives, each captured with brackets
# so we can, after a match, return the number of the first matching alternative

sub compiled_alts
{
  my $alts = join( '|', map { "($_)" } @_ );
  qr/$alts/;
}

#my %rx_in = map { $_ => [ map { $rx_for{$_} } @{$transitions{$_}} ] } keys %transitions;
#my %crx_in = map { compile_alts(@{$rx_in{$_}}) } keys %rx_in;

# not good enough: we need to ensure progress
# even when an alternative matches with $` and $' both empty,
# which will happen, for instance, when it is itself empty,
# which, in turn, will happen when e.g. a transition is not defined for a language;
# so in such a case, we must rematch the remaining alternatives,
# and translate the alternative's index back to its index in the full array;

# we must never have *all* alternatives match trivially (never reach the BUG below)
# so add a fallback transition to each state that advances one character;
# but first *remove* all transitions that have no (i.e. an empty) matching expression
# because executing the fallback for them on every input character is very inefficient

foreach my $state ( keys %transitions )
{
  # remove transitions without match expressions
  my @tr = @{ $transitions{$state} };

  # a 0/1 vector for each transition of this state
  my @nonempty = map { $rx_for{ $tr[$_] } ne '' || 0 } 0 .. $#tr;

  if ( grep { $_ == 1 } @nonempty )
  {
    #warn scalar(@nonempty) . " transitions for $state remain\n";

    foreach my $hr ( \%transitions, \%next_state, \%output_matched )
    {
      $hr->{$state} =
       [ map { $nonempty[$_] ? $hr->{$state}->[$_] : () } 0 .. $#nonempty ];
    }
  }
  else
  {
    #warn "no transition for $state remains\n";

    foreach my $hr ( \%transitions, \%next_state, \%output_matched )
    {
      delete $hr->{$state};
    }
  }

}

#warn "$_ -> ", join(', ', @{$transitions{$_}}), "\n" for keys %transitions;

foreach my $state ( keys %transitions )
{
  # except the final state!
  next if $state eq 'none';
  # add a transition ...
  push( @{ $transitions{$state} }, 'skipchar' );
  # ... that advances one character by matching it ...
  $rx_for{skipchar} = '.';
  # ... but really skips it ...
  push( @{ $output_matched{$state} }, $output_skipped{$state} );
  # ... without changing the $state
  push( @{ $next_state{$state} }, $state );
}

sub suffix_lists
# given a list @a, returns the list ([@a], [@a[1..$#a]], [@a[2..$#a]], ... @a[$#a])
{
  map { [ @_[ $_ .. $#_ ] ] } 0 .. $#_;
}

sub compiled_remaining_alts
# given a list of regexps @rx, returns an array @cx of arrays of compiled regexps
# such that $cx[$i] is the compiled regexp to match the alternatives @rx[$i .. $#rx]
{
  map { compiled_alts(@$_) } suffix_lists(@_);
}

# consistently substitute the transition names $_ with their $rx_for{$_}
# but do not include empty expressions, we don't want to match them
my %rx_in = map {
  $_ => [ map { $rx_for{$_} } @{ $transitions{$_} } ]
} keys %transitions;

my %crx_in =
 map { $_ => [ compiled_remaining_alts( @{ $rx_in{$_} } ) ] } keys %rx_in;

process_all();
exit(0);

#--- auxiliary functions #
#

sub HELP_MESSAGE
{
  print STDERR <<ZZ;

Usage: uncomment [options] file [file2 [...]]]

to output the contents of the given files with comments removed.
Has an imperfect notion of quoting or escaping, so this may go wrong.

Options:

 -r

  to preserve all carriage return and newline characters
  even within removed comments, so as to keep the same line numbers
  on the remaining material

 -1 separators

  to supply a space-separated list of single-line comment separators;
  note that in a Unix shell it must be quoted; the default is '//'

 -2 brackets

  to supply a space-separated list of block comment bracket strings;
  note that in a Unix shell it must be quoted; the default is '/* */'

  At this moment only the first pair is used.

 -q quotes

  to supply a space-separated list of string quote and unquote symbols;
  note that in a Unix shell it must be quoted; the default is '" "'

  At this moment only the first pair is used.

 -e regexp

  to supply a Perl regular expression to recognize escaped material
  within strings; the default is '\\.'

 -p regexp

  to supply a Perl regular expression to recognize escaped material
  outside of strings or comments; for Perl, the default is [sm\$]#;

 -l language

  to supply a space-separated list of languages,
  used to initialize the -1, -2 and -p options;
  defaults to c

Examples:

  $0 -l perl $0
  $0 -2 '' -1 '#' -p '\$#|s#|m#' $0

  shoddily removes comments from this script:
    comments start with #, but \$#, s# and m# do not start a comment

ZZ

  exit(0);
}

sub VERSION_MESSAGE
{
  my $id = '$Id$';
  my ($v) = ( $id =~ /.{5}(.*).$/ );
  warn "Version: $v\n";
  exit(0);
}

sub process_all
{
  undef $/;    # slurp mode

  foreach my $f (@ARGV)
  {
    #warn "processing $f\n";

    if ( $f eq '-' )
    {
      process_lines(<STDIN>);
      close(STDIN);
    }
    elsif ( !-f $f )
    {
      warn "not a plain file, skipped: $f\n";
    }

    #elsif (!-r $f)
    #{
    #  warn "unreadable, skipped: $f\n";
    #  next;
    #}
    # not a good idea on Cygwin, where files may be !-r and still readable
    #
    elsif ( !open( F, '<', $f ) )
    {
      warn "cannot open $f, skipped\n";
    }
    else
    {
      process_lines(<F>);
      close(F);
    }
  }
}

sub process_lines
{
  @_ == 1
   or die "aborting, BUG: process() must be called with 1 argument\n";

  my $input = $_[0];

  my $state = 'none';
  my $caret = 0;

 MATCH:

  while ( my @crx = @{ $crx_in{$state} // [] } )
  {
    my $matchidx;

    foreach my $i ( 0 .. $#crx )
    {
      my $rx = $crx[$i];
      #warn "matching in $state after $i skipped alternatives, against $rx\n";
      my $m_a = ( firstidx { defined } ( substr( $input, $caret ) =~ /$rx/ ) );

      if ( $m_a == -1 )
      {
        #warn "no alternative matched\n";
        last MATCH;
      }
      elsif ( $` ne '' || $& ne '' )
      {
        #warn $transitions{$state}->[ $i + $m_a ], " expression in $rx matched: $&\n";
        $matchidx = $m_a + $i;

        #warn "match $matched induces transition $state -[$transitions{$state}->[$matchidx]]-> $next_state{$state}->[$matchidx]\n";# if $matchidx < $#{ $rx_in{$state} };
        print_if( $output_skipped{$state},              $` );
        print_if( $output_matched{$state}->[$matchidx], $& );
        $state = $next_state{$state}->[$matchidx];
        $caret += length($`) + length($&);
        last;
      }

      #warn $transitions{$state}->[$i+$m_a], " expression in $rx matched trivially\n";
    }

  }

  print_if( $output_skipped{$state}, substr( $input, $caret ) );
  warn "end of file in state $state\n" if $state ne 'none';
}

sub print_if
# if we want output, print it;
# if not, we may still want to print the newlines in the output
{
  my ( $want_output, $output ) = @_;

  if ($want_output)
  {
    print $output;
  }
  elsif ($always_print_newlines)
  {
    $output =~ s/[^\r\n]//g;
    print $output;
  }
}