#!/usr/bin/env perl # # wordgen - from a list of words, generate similar words # according to triple frequencies observed in the input # algoritme rammelt nog ... use warnings; use strict; my %count; my %nextfreq; # maps substrings to a hash mapping chars to frequencies; # begin and end of line are not indicated explicitly, but # short strings occur at the start of the sentence sub addword { my $word = shift(@_); ++$count{$word}; for ( my $i = 0 ; $i < length $word ; ++$i ) { # increment the frequency on the triple from $i-2 to $i my $head = substr( $word, 0, $i ); my $triplekey = ( length $head < 2 ) ? '' . $head : substr( $head, length($head) - 2 ); my $nextchar = ( 1 + $i eq length $word ) ? '' : substr( $word, $i, 1 ); #warn("incrementing '$triplekey:$nextchar'\n"); ++( $nextfreq{$triplekey}{$nextchar} ); } } sub weightedpick # given a ref to a hash of items with weights, picks an item { my $weightref = shift(@_); my %weight = %$weightref; return -1 if !%weight; my $sum = 0; foreach ( values %weight ) { $sum += $_; } my $pick = int rand $sum; my $sumsofar = 0; foreach my $item ( sort keys %weight ) { $sumsofar += $weight{$item}; if ( $pick < $sumsofar ) { #warn("weightedpick of ",join(',', map "('$_',$weight{$_})", sort keys %weight), " yields '$item'\n"); return $item; } } # huh? die "bug in weightedpick routine"; } #--- process words from input --- # while (<>) { foreach my $word ( split( /[^A-Za-z]+/, $_ ) ) { &addword("\L$word\E") if $word ne ''; } } sub completeword { my $word = shift(@_); #warn("completing '$word'\n"); my $wordtail = length $word < 2 ? $word : substr( $word, length($word) - 2 ); #warn("wordtail is '$wordtail'\n"); if ( !exists $nextfreq{$wordtail} ) { return undef; } my %options = %{ $nextfreq{$wordtail} }; while (%options) # there is still a value in %$nextfreq we haven't tried { my $nextchar = &weightedpick( $nextfreq{$wordtail} ); die "bug in completeword routine" if !defined $nextchar; if ( $nextchar eq '' ) { # end of word reached return $word; } if ( defined( my $result = &completeword( $word . $nextchar ) ) ) { return $result; } delete $options{$nextchar}; } } sub pickword { &completeword(''); } #my $sylnum = &weightedpick(\%sylnumcount); #warn("we want a word with $sylnum fragments\n"); while ( defined( my $word = &pickword ) ) { next if exists $count{$word}; print "$word\n"; &addword($word); }