#!/usr/bin/env perl # # factoids - extract infobot factoids from an (ASCII) irc.log # # $Id$ use warnings; use strict; my %factoids = (); sub add { my ( $key, $value ) = @_; if ( exists $factoids{$key} ) # check for duplicates, but preserve order { foreach my $v ( @{ $factoids{$key} } ) { return if $v eq $value; } } push( @{ $factoids{$key} }, $value ); } while (<>) { if (/^[\d:]+ /) # timestamp { $_ = $'; } /^$/$1/; $nick =~ s/\/#.+//; $nick =~ s/_+$//; $nick =~ y/A-Z/a-z/; &add( $nick, $_ ); } else { # , in head? probably an aanspreking (sorry don't know the English word) $head =~ s/.*[,.!?:;] +//; $head =~ y/A-Z/a-z/; # remove surrounding non-alpha chars if ( $head =~ /\w.*\w/ && "$`$'" ne '' ) # 2nd condition is to preserve things like '1.9.0' { $head = $&; # if some more are found now, we're probably beyond recovery next if $head =~ /[^\w\s]/; } # remove some of the infobotisms and humanisms while ( $head =~ s/^((i (also |really |only |do )?)(guess|heard?|think|thought|believe|am sure|am afraid|am happy|found out|hope|know|read|figure|wonder|presume|assume|suppose|suspect|bet|swear|realized?|ask if)|are you sure|do you know|don't say|find out|ask(ing)? (me|him|her)|rumour has it|(it ?)?(seems|appears|shows|has been said|looks like|seems like|turns out)|probably|possibly|mostly|usually|normally|now|well|but|and|because|cause|since|sometimes|maybe|perhaps|meanwhile|just|although|even|fact|at ?least|anyone knows?|how come|(a+h|o+h|geez|wow|sorry|heh|hey|hehe|h?m+|ok|okay|so|say|like|yes|yeah?|no|nope|actually|apparently|exactly|btw|by the way|in fact|then again|damn|eg|for (eg|example|instance)),?)[.\s]+(that\s+)?// ) { } # an interesting special case: $head =~ s/^(i don'?t think|i('?| a)m not sure)\s+// and $tail = 'not ' . $tail; $head =~ s/^the\s+//; $tail =~ s/^probably\s+//; # another infobotism &add( $head, $tail ); } } warn sprintf( "%s factoids found in %s lines\n", scalar keys %factoids, $. ); foreach my $key ( sort keys %factoids ) { print $key, ' => ', join( ' | ', @{ $factoids{$key} } ), "\n"; }