#!/usr/bin/env perl # # selenium-linkchecker - check website links with Selenium # # $Id$ # see https://metacpan.org/pod/Selenium::Remote::Driver # works! (on winlqstux) but it is far too slow # first you must download and start the Selenium Server: e.g. # wget https://selenium-release.storage.googleapis.com/2.53/selenium-server-standalone-2.53.1.jar # java -jar selenium-server-standalone-2.53.1.jar use strict; use warnings; #use Selenium::Firefox; use Selenium::Remote::Driver; use Getopt::Long; my %opt = ( url => 'http://winlqswww/', user => 'admin', password => 'minda' ); GetOptions( \%opt, 'help', 'url:s', 'user:s', 'password:s' ) or die "$0: cannot parse command line: $@\n"; HELP_MESSAGE() if $opt{help}; my %nr_finds; my @urls_to_crawl = ( $opt{url} ); my ($scope_url) = $opt{url} =~ m#(.*/)#; #my $driver = Selenium::Firefox->new my $driver = Selenium::Remote::Driver->new or oomph('cannot connect to Selenium server'); ++$nr_finds{ $opt{url} }; crawl_and_report(); sleep(5); $driver->quit(); exit(0); #--- auxiliaries ---# # sub HELP_MESSAGE { print STDERR "Read the source code for help. Enjoy,\n"; exit(0); } sub oomph { die "$0: fatal error:", join( ' ', @_ ), " ($@)\n"; exit(1); } sub urls_in_scope { grep { substr( $_, 0, length($scope_url) ) eq $scope_url } @_; } sub crawl_and_report { while ( my $url = shift(@urls_to_crawl) ) { get($url) or next; report($url); if ( my $real_url = $driver->get_current_url() ) { ++$nr_finds{$real_url} if $url ne $real_url; } foreach my $url_to_crawl ( map { s/#.*//r } grep { /^http/ } urls_in_scope( a_hrefs() ) ) { push( @urls_to_crawl, $url_to_crawl ) if !$nr_finds{$url_to_crawl}; ++$nr_finds{$url_to_crawl}; printf( "URL occurrence number %d: %s\n", $nr_finds{$url_to_crawl}, $url_to_crawl ); } } } sub get { my ($url) = @_; my $success = 1; $driver->error_handler( sub { print "URL retrieval failed for $url: $_[0]\n"; $success = 0; } ); $driver->get(@_); $success; } sub report { my ($url) = @_; print "URL retrieved: $url i.e. ", $driver->get_current_url(), "\n"; } sub a_hrefs { my @urls = grep { defined($_) } map { eval { $_->get_attribute('href') } } dfs('//a[@href]'); # returns absolute URLs! see # https://github.com/seleniumhq/selenium-google-code-issue-archive/issues/1824 # the eval {} is required because get_attribute crashes # on http://www.win.tue.nl/bcf/ print "URL found: $_\n" for @urls; @urls; } sub df { eval { $driver->find_element(@_) }; } # catch exceptions sub dfs { eval { $driver->find_elements(@_) }; } sub dfc { eval { $driver->find_child_element(@_) }; } sub login # assuming we are not logged in, tries to log in # this is Sakai specific ... { df("//input[\@id='eid']")->send_keys( $opt{user} ) or oomph('cannot set user name'); df("//input[\@id='pw']")->send_keys( $opt{password} ) or oomph('cannot set password'); df("//input[\@id='submit']")->submit() or oomph('cannot submit login'); df("//a[\@title='Logout']") or oomph('cannot find logout button'); } #login(); sub select_outer_frame { $driver->switch_to_frame() or oomph("cannot switch to outer frame"); } sub select_inner_frame # need to do this e.g. after something is clicked in it { select_outer_frame(); # not always needed, but who cares $driver->switch_to_frame( df("//iframe") ) or oomph("cannot switch to inner frame"); } sub go_to_site # goes to the given site # no pre- or postcondition checking { my ($name) = @_; my $body = df("//div[\@class='portletBody']") or oomph("cannot find portlet body"); #my @site_as = dfs("//a"); # finds elements in the outer frame!? does it still do this? #my @site_as = $driver->find_child_elements($body, my @site_as = dfs("//td[\@headers='title']/h4/a[\@target='_top']") or die "cannot find any site links\n"; my ($site_a) = grep { $_->get_text() eq $name } @site_as or oomph("cannot find link to site '$name'"); $site_a->click() or oomph("cannot click link to site '$name'"); }