#!/usr/bin/env perl
# geturls - extract URLs from text
# bug: gets first URL on line only
# $Id$

use warnings;
use strict;
use Getopt::Std;

my %opt;
getopts( 'he:', \%opt );    # -h: print usage; -e takes a filter pattern
die("Usage: $0 [-e pattern] [file ...]\n") if defined $opt{h};
$opt{e} =~ s|/|\\\/|g if defined $opt{e};
$opt{e} = '.' if !defined $opt{e};

my %url = ();    # none seen yet

# tokenize
# regard input as one string
#undef ($/);
#if (!(($_ = <>) && (@input = split)))
#{
#    warn("no input\n");
#}
#foreach (@input)
# the above approach appears to read only the first file,
# and slows processing down by a factor of 6 on ~/News/new.www
# this may be related to the frequency of URLs in the input
# further speedup would be nice and possible

# scan each input line; only bother with lines that match the filter
# pattern and contain something URL-ish (a ":/" sequence)
while (<>) {
    if ( /$opt{e}/ && /:\// ) {
        while (
            m#(https?|gopher|file|ftp|wais|mailto|finger|news|nntp|irc):[^\s,"')>]+\w[^\s,"')>]*#g
          )
        {
            # looks like a URL; record it, keyed on the full match
            $url{$&} = 1;
        }
    }
}

# print the collected URLs, de-duplicated and sorted
print "$_\n" for sort keys %url;
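
# Illustrative usage (the invocations below are examples, not part of the
# original script; only ~/News/new.www is mentioned above, the rest of the
# file names and patterns are hypothetical):
#
#   geturls ~/News/new.www          # list every URL found in the file
#   geturls -e perl *.txt           # only lines that also match /perl/
#   some-command | geturls          # read from standard input
#
# Output is one URL per line, unique and sorted.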