#!/usr/bin/env perl
# geturls - extract URLs from text
# bug: gets first URL on line only
# $Id$

use warnings;
use strict;
use Getopt::Std;

my %opt;
getopts( 'he:', \%opt );    # -h: print usage; -e takes a filter pattern
die("Usage: $0 [-e pattern] [file ...]\n") if defined $opt{h};
$opt{e} =~ s|/|\\\/|g if defined $opt{e};
$opt{e} = '.' if !defined $opt{e};

my %url = ();    # none seen yet

# tokenize
# regard input as one string
#undef ($/);
#if (!(($_ = <>) && (@input = split)))
#{
#    warn("no input\n");
#}
#foreach (@input)
# the above approach appears to read only the first file,
# and slows processing down by a factor of 6 on ~/News/new.www
# this may be related to the frequency of URLs in the input
# further speedup would be nice and possible

# scan each input line; only bother with lines that match the filter
# pattern and contain something URL-ish (a ":/" sequence)
while (<>) {
    if ( /$opt{e}/ && /:\// ) {
        while (
            m#(https?|gopher|file|ftp|wais|mailto|finger|news|nntp|irc):[^\s,"')>]+\w[^\s,"')>]*#g
          )
        {
            # looks like a URL; record it, keyed on the full match
            $url{$&} = 1;
        }
    }
}

# print the collected URLs, de-duplicated and sorted
print "$_\n" for sort keys %url;
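
# Illustrative usage (the invocations below are examples, not part of the
# original script; only ~/News/new.www is mentioned above, the rest of the
# file names and patterns are hypothetical):
#
#   geturls ~/News/new.www          # list every URL found in the file
#   geturls -e perl *.txt           # only lines that also match /perl/
#   some-command | geturls          # read from standard input
#
# Output is one URL per line, unique and sorted.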