#!/usr/bin/env perl # # uncomment - remove or modify comments in source code # # $Id$ # # based on an earlier script, loc.pl # NOTE: not everything works, but I only need -l c++ for now use warnings; use strict; use Getopt::Std; use List::Util 'sum'; use List::MoreUtils 'firstidx'; my %opt; getopts( 'h?r1:2:e:l:p:q:', \%opt ) or exit(-1); my @lang = $opt{'l'} ? split( /\s+/, $opt{'l'} ) : ('c'); # languages (shortcuts) # pairs of strings: my %lang2q = ( 'c' => '" "', 'c++' => '" "', 'cs' => '" "', 'c#' => '" "', 'java' => '" "' ); # pairs of strings: my %lang22 = ( 'c' => '/* */', 'c++' => '/* */', 'cs' => '/* */', 'c#' => '/* */', 'java' => '/* */' ); # lists of strings: my %lang21 = ( 'c++' => '//', 'cs' => '//', 'vb' => "'", 'perl' => '#', 'sh' => '#' ); # a regexp: # \\\\ because one escape is for Perl, one for within the regexp my %lang2e = ( 'c++' => '\\\\.', 'cs' => '\\\\.', 'vb' => '""', 'perl' => '\\\\.', 'sh' => '\\\\.', ); # a regexp: my %lang2p = ( 'perl' => '[sm$]#' ); my %is_lang = map { $_ => 1 } map { keys %$_ } ( \%lang2q, \%lang22, \%lang21, \%lang2e ); if ( my @wronglang = grep { !defined $is_lang{$_} } @lang ) { my @rightlang = sort keys %is_lang; die 'supported values for -l are: ', join( ', ', @rightlang ), ', not: ', @wronglang, "\n"; } HELP_MESSAGE() if $opt{'h'} || $opt{'?'}; my $always_print_newlines = $opt{'r'}; @ARGV = ('-') if !@ARGV; my $lang21 = join( ' ', grep { $_ ne '' } map { $lang21{$_} // '' } @lang ); my $lang22 = join( ' ', grep { $_ ne '' } map { $lang22{$_} // '' } @lang ); my $lang2q = join( ' ', grep { $_ ne '' } map { $lang2q{$_} // '' } @lang ); my $lang2e = join( '|', grep { $_ ne '' } map { $lang2e{$_} // '' } @lang ); my $lang2p = join( '|', grep { $_ ne '' } map { $lang2p{$_} // '' } @lang ); # TO DO: support multiple bracket pairs #my @justopen = split( /\s+/, $opt{'1'} // $lang21 ); # 1-line separators #my %open2close = split( /\s+/, $opt{'2'} // $lang22 ); # brackets #my %quote2unquote = split( /\s+/, $opt{'q'} // $lang2q ); # quote pairs # #warn "comment brackets: ", join(' ', map { ($_, $open2close{$_}) } keys %open2close), "\n"; #warn "comment starters: ", join(' ', @justopen), "\n"; my @linec_delim = split( /\s+/, $opt{'1'} // $lang21 ); my @blockc_delim = split( /\s+/, $opt{'2'} // $lang22 ); my @string_delim = split( /\s+/, $opt{'q'} // $lang2q ); my %rx_for = ( 'blockstart' => ( @blockc_delim ? quotemeta( $blockc_delim[0] ) : '' ), 'blockend' => ( @blockc_delim > 0 ? quotemeta( $blockc_delim[1] ) : '' ), 'linecstart' => ( @linec_delim ? join( '|', map { quotemeta } @linec_delim ) : '' ), 'newline' => '[\r\n]+', # FreeRTOS has some funny line ends 'quote' => ( @string_delim ? quotemeta( $string_delim[0] ) : '' ), 'unquote' => ( @string_delim > 1 ? quotemeta( $string_delim[1] ) : '' ), 'escaped' => ( $opt{'e'} // $lang2e // '' ), # escapes in strings 'protected' => ( $opt{'p'} // $lang2p // '' ) # escapes outside strings ); #warn "regex($_) = $rx_for{$_}\n" for keys %rx_for; # while scanning the input, we can be in 4 $states: # # - 'blockc': in a block comment # - 'linec': in an end-of-line comment # - 'string': in a string # - 'none': elsewhere # # start: none # final: none # transitions: # none -blockstart-> blockc # blockc -blockend-> none # none -linecstart-> linec # linec -newline-> none # none -quote-> string # string -unquote-> none # none -protected-> none # string -escaped-> string # # however, escaped and protected must be matched # *before* other transitions, because their regular expressions # may overlap with them and the order matters # (the transitions aren't on single symbols!) # the regexps to scan for in each state; # note that escaped and protected are at the front my %transitions = ( 'none' => [qw(protected linecstart blockstart quote)], 'blockc' => [qw(blockend)], 'linec' => [qw(newline)], 'string' => [qw(escaped unquote)] ); # transitions my %next_state = ( 'none' => [qw(none linec blockc string)], 'blockc' => [qw(none)], 'linec' => [qw(none)], 'string' => [qw(string none)] ); # whether to output $` my %output_skipped = ( 'none' => 1, 'blockc' => 0, 'linec' => 0, 'string' => 1 ); # whether to output $& my %output_matched = ( 'none' => [qw(1 0 0 1)], 'blockc' => [qw(0)], 'linec' => [qw(1)], 'string' => [qw(1 1)] ); # the matching process: # we want to match for the transition markers in a single regular expression, # in which they are alternatives, each captured with brackets # so we can, after a match, return the number of the first matching alternative sub compiled_alts { my $alts = join( '|', map { "($_)" } @_ ); qr/$alts/; } #my %rx_in = map { $_ => [ map { $rx_for{$_} } @{$transitions{$_}} ] } keys %transitions; #my %crx_in = map { compile_alts(@{$rx_in{$_}}) } keys %rx_in; # not good enough: we need to ensure progress # even when an alternative matches with $` and $' both empty, # which will happen, for instance, when it is itself empty, # which, in turn, will happen when e.g. a transition is not defined for a language; # so in such a case, we must rematch the remaining alternatives, # and translate the alternative's index back to its index in the full array; # we must never have *all* alternatives match trivially (never reach the BUG below) # so add a fallback transition to each state that advances one character; # but first *remove* all transitions that have no (i.e. an empty) matching expression # because executing the fallback for them on every input character is very inefficient foreach my $state ( keys %transitions ) { # remove transitions without match expressions my @tr = @{ $transitions{$state} }; # a 0/1 vector for each transition of this state my @nonempty = map { $rx_for{ $tr[$_] } ne '' || 0 } 0 .. $#tr; if ( grep { $_ == 1 } @nonempty ) { #warn scalar(@nonempty) . " transitions for $state remain\n"; foreach my $hr ( \%transitions, \%next_state, \%output_matched ) { $hr->{$state} = [ map { $nonempty[$_] ? $hr->{$state}->[$_] : () } 0 .. $#nonempty ]; } } else { #warn "no transition for $state remains\n"; foreach my $hr ( \%transitions, \%next_state, \%output_matched ) { delete $hr->{$state}; } } } #warn "$_ -> ", join(', ', @{$transitions{$_}}), "\n" for keys %transitions; foreach my $state ( keys %transitions ) { # except the final state! next if $state eq 'none'; # add a transition ... push( @{ $transitions{$state} }, 'skipchar' ); # ... that advances one character by matching it ... $rx_for{skipchar} = '.'; # ... but really skips it ... push( @{ $output_matched{$state} }, $output_skipped{$state} ); # ... without changing the $state push( @{ $next_state{$state} }, $state ); } sub suffix_lists # given a list @a, returns the list ([@a], [@a[1..$#a]], [@a[2..$#a]], ... @a[$#a]) { map { [ @_[ $_ .. $#_ ] ] } 0 .. $#_; } sub compiled_remaining_alts # given a list of regexps @rx, returns an array @cx of arrays of compiled regexps # such that $cx[$i] is the compiled regexp to match the alternatives @rx[$i .. $#rx] { map { compiled_alts(@$_) } suffix_lists(@_); } # consistently substitute the transition names $_ with their $rx_for{$_} # but do not include empty expressions, we don't want to match them my %rx_in = map { $_ => [ map { $rx_for{$_} } @{ $transitions{$_} } ] } keys %transitions; my %crx_in = map { $_ => [ compiled_remaining_alts( @{ $rx_in{$_} } ) ] } keys %rx_in; process_all(); exit(0); #--- auxiliary functions # # sub HELP_MESSAGE { print STDERR <); close(STDIN); } elsif ( !-f $f ) { warn "not a plain file, skipped: $f\n"; } #elsif (!-r $f) #{ # warn "unreadable, skipped: $f\n"; # next; #} # not a good idea on Cygwin, where files may be !-r and still readable # elsif ( !open( F, '<', $f ) ) { warn "cannot open $f, skipped\n"; } else { process_lines(); close(F); } } } sub process_lines { @_ == 1 or die "aborting, BUG: process() must be called with 1 argument\n"; my $input = $_[0]; my $state = 'none'; my $caret = 0; MATCH: while ( my @crx = @{ $crx_in{$state} // [] } ) { my $matchidx; foreach my $i ( 0 .. $#crx ) { my $rx = $crx[$i]; #warn "matching in $state after $i skipped alternatives, against $rx\n"; my $m_a = ( firstidx { defined } ( substr( $input, $caret ) =~ /$rx/ ) ); if ( $m_a == -1 ) { #warn "no alternative matched\n"; last MATCH; } elsif ( $` ne '' || $& ne '' ) { #warn $transitions{$state}->[ $i + $m_a ], " expression in $rx matched: $&\n"; $matchidx = $m_a + $i; #warn "match $matched induces transition $state -[$transitions{$state}->[$matchidx]]-> $next_state{$state}->[$matchidx]\n";# if $matchidx < $#{ $rx_in{$state} }; print_if( $output_skipped{$state}, $` ); print_if( $output_matched{$state}->[$matchidx], $& ); $state = $next_state{$state}->[$matchidx]; $caret += length($`) + length($&); last; } #warn $transitions{$state}->[$i+$m_a], " expression in $rx matched trivially\n"; } } print_if( $output_skipped{$state}, substr( $input, $caret ) ); warn "end of file in state $state\n" if $state ne 'none'; } sub print_if # if we want output, print it; # if not, we may still want to print the newlines in the output { my ( $want_output, $output ) = @_; if ($want_output) { print $output; } elsif ($always_print_newlines) { $output =~ s/[^\r\n]//g; print $output; } }