#!/usr/bin/env perl # # shortest - preserve only line of which no proper prefix is also a line # # $Id: shortest 703 2010-02-04 15:22:27Z rp $ # # optimized to avoid unnecessary sorting # still sorts "too much" on nonsorted input; # using prefix trees or similar will be more efficient in some cases use warnings; use strict; use Getopt::Std; my %opt; getopts( 'hve:r:', \%opt ); &help if $opt{h}; my $prefixfilter = $opt{e}; my $restfilter = $opt{r}; sub help { print STDERR <; chomp for @line; my %firstline = map { $line[$_] => $_ } reverse 0 .. $#line; my @sorted = sort keys %firstline; # on sorted input, we know all prefixes of a string occur before that string my $prefix; # the longest current prefix, if any my @pl; # the lengths of valid prefixes of $prefix in the input foreach (@sorted) { my $fd = @pl ? firstdiff( $prefix, $_ ) : 0; if (@pl) { # we have a $prefix to check against # try to validly match a prefix foreach my $l (@pl) { if ( $fd >= $l && ( !defined $restfilter || substr( $_, $l ) =~ $restfilter ) ) { # we have a valid match against substr($_, 0, $l) &$ehm( "$_ matches " . substr( $_, 0, $l ) . ", omitted" ); delete $firstline{$_}; last; } elsif ( $fd >= $l ) { &$ehm( "$_ matches " . substr( $_, 0, $l ) . ", but " . substr( $_, $l ) . " does not match $restfilter" ); } else { &$ehm("$_ only matches up to $fd, not to $l"); last; } } } if ( !defined $prefixfilter || /$prefixfilter/ ) { # $_ is valid as a prefix to check against # set it as the new $prefix $prefix = $_; my $l = length($_); while ( @pl && $pl[$#pl] > $fd ) { pop(@pl); } push( @pl, $l ) if $l > $fd; #warn '0..', join( ',', @pl ), " of $_\n"; } } sub firstdiff # yields the index of the first character that differs between the arguments { my ( $s, $t ) = @_; my $minl = length($s) < length($t) ? length($s) : length($t); foreach my $i ( 0 .. $minl ) { if ( substr( $s, $i, 1 ) ne substr( $t, $i, 1 ) ) { return $i; } } return $minl + 1; } print $_, $/ for grep { defined $firstline{$_} } @line;