#!/usr/bin/env perl
#
# deword-html - a simple demoronizer for HTML documents generated with Word 9
#
# $Id$
use strict;
use warnings;
use HTML::TreeBuilder;
#--- auxiliaries to improve HTML::Element ---#
#
sub HTML::Element::tree_as_HTML
{
my $self = shift;
if ( !ref $self )
{
return $self;
}
elsif ( !$self->isa('HTML::Element') )
{
return $self;
}
elsif ( !$self->isa('HTML::TreeBuilder') ) # hmmm
{
return $self->as_HTML(@_);
}
elsif ( $self->tag ne 'html' ) # ?!
{
return $self->as_HTML(@_);
}
# un-mess parse()'s handling of initial non-elements
my $result = '';
my $after_body = 0;
foreach my $x ( $self->content_list )
{
if ( !ref $x || !$x->isa('HTML::Element') )
{
# nothing
}
elsif ( $x->tag eq 'body' )
{
$after_body = 1;
}
elsif ( $after_body && $x->tag =~ /^~/ )
{
$result .= $x->as_HTML(@_);
$x->detach;
}
}
return $result . $self->HTML::Element::as_HTML(@_);
}
sub HTML::Element::insert_parent
# arguments: ($h, $offset, $length, $newparent)
# replaces the $length elements of $h at offset $offset with $newparent
# and adds them to its children (if any)
{
my ( $h, $offset, $length, $newparent ) = @_;
$newparent->push_content(
$h->splice_content( $offset, $length, $newparent ) );
}
#--- command line parse ---#
#
@ARGV
or @ARGV = ('-');
my $t = HTML::TreeBuilder->new;
$t->store_comments(1);
$t->store_declarations(1);
$t->store_pis(1);
$t->marked_sections(1); # doesn't work :(
$t->warn(0);
#--- process input documents ---#
#
foreach my $f (@ARGV)
{
#--- parse document $f ---#
#
$t->parse_file($f);
# this messes up declarations, such as DOCTYPE, at the start,
# putting them at the end of the children
# because the code assumes everything is under a single root
#
# we'll undo this when printing but we may of course do too much there
#--- remove everything from the
except the title ---#
#
my $head = $t->find_by_tag_name('head');
if ( defined $head )
{
my $title = $head->find_by_tag_name('title');
if ( defined $title )
{
$title = $title->clone;
}
else
{
$title = HTML::Element->new('title')->push_content("the document '$f'");
}
$head->delete_content;
$head->push_content($title);
}
#--- try to recuperate HTML structure from formatting ---#
# to allow a real roundtrip: Word -> $0 -> tidy #
# experimental: replace s, but it may
ize
# things that weren't
my @dd = $t->look_down(
sub {
$_[0]->tag eq 'p'
&& defined $_[0]->attr('style')
&& $_[0]->attr('style') =~ /^margin-left:/;
}
);
my %is_dd = map { $_ => 1 } @dd;
my @dt = $t->look_down(
sub {
$_[0]->tag eq 'p'
&& !$is_dd{ $_[0] }
&& defined( $_[0]->right )
&& $is_dd{ $_[0]->right };
}
);
my %is_dt = map { $_ => 1 } @dt;
my @first_dt = grep {
!defined( $_->left )
|| !( $is_dt{ $_->left } || $is_dd{ $_->left } )
} @dt;
foreach my $dt (@first_dt)
{
my @dx = ();
foreach ( my $dx = $dt ; defined $dx->right ; $dx = $dx->right )
{
if ( $is_dt{$dx} )
{
#warn "this " , $dx->tag, " is a - \n";
$dx->tag('dt');
push( @dx, $dx );
}
elsif ( $is_dd{$dx} )
{
#warn "this " , $dx->tag, " is a
- \n";
$dx->tag('dd');
push( @dx, $dx );
}
else
{
#warn "this " , $dx->tag, " isn't part of the
\n";
last;
}
}
$dt->parent->insert_parent( $dt->pindex, scalar(@dx),
HTML::Element->new('dl') );
next;
}
#--- remove all MS elements (o:*, v:*, and w:*) ---#
#
my @mselems = $t->look_down(
sub {
$_[0]->tag =~ /^[ovw]:/;
}
);
foreach my $mselem ( reverse @mselems ) # in postorder !
{
$mselem->replace_with_content->delete;
}
#--- remove all MS pragmas and (!--[endif]-->
#
my @mspragmas = $t->look_down(
sub {
$_[0]->tag eq '~comment' && $_[0]->attr("text") =~ /^\[(end)?if/;
}
);
foreach my $mspragma ( reverse @mspragmas ) # in postorder !
{
$mspragma->delete;
}
#--- remove all style and MS attributes from all elements ---#
#
# in addition, remove all HTML formatting attributes
# and attributes
$t->look_down(
sub {
foreach
my $attr (qw(style align valign height width link vlink type bgcolor))
{
$_[0]->attr( $attr, undef );
}
foreach my $attr ( $_[0]->all_external_attr_names )
{
if ( $attr =~ /^xmlns/
|| $attr =~ /^[ovw]:/
|| $_[0]->attr($attr) =~ /^Mso/ ) # matches class="MsoNormal"
{
$_[0]->attr( $attr, undef );
}
}
0; # continue!
}
);
#--- remove all marked sections (if/endif that aren't comments) ---#
#
$t->look_down(
sub {
foreach my $item_r ( $_[0]->content_refs_list )
{
next if ref $$item_r;
#$$item_r =~ s/\<!\[[^]]*\]\>//g;
$$item_r =~ s///g;
}
0; # continue!
}
);
#--- remove all s in text ---#
#
my $nbsp = $HTML::Entities::entity2char{'nbsp'};
$t->look_down(
sub {
foreach my $child ( $_[0]->content_refs_list )
{
next if ref $$child; # skip the elements
$$child =~ s/($nbsp)+/ /g;
}
0; # continue!
}
);
#--- remove certain elements, but not their content ---#
#
# you may wish to finetune this ...
#
map { $_->replace_with_content->delete } reverse $t->look_down(
# all , elements
# unless they have an id, name or class attribute
sub {
grep { lc( $_[0]->tag ) eq $_ }
( 'font', 'b', 'i', 'em', 'strong', 'br' );
}
);
#--- remove certain elements (not their content), unless anchors ---#
#
map { $_->replace_with_content->delete }
reverse $t->look_down( # in postorder!
# all , elements
# unless they have an id, name or class attribute
sub {
( grep { lc( $_[0]->tag ) eq $_ } ( 'span', 'class' ) )
&& ( !grep { defined( $_[0]->attr($_) ) } ( 'id', 'name', 'class' ) );
}
);
#--- remove all empty elements of certain types ---#
#
# this must be done recursively
#
while (
map { $_->replace_with_content->delete } reverse $t->look_down(
sub {
# a , or , or ...
(
grep { lc( $_[0]->tag ) eq $_ } (
'p', 'b', 'em', 'i', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'td', 'tr', 'table'
)
)
&&
# which is empty (i.e. without a non-
element or non-blank character)
( !grep { ref($_) ? lc( $_->tag ) ne 'br' && $_->tag !~ /^~/ : /\S/ }
$_[0]->content_list() );
}
)
)
{
}
#--- output the resulting document ---#
#
print $t->tree_as_HTML;
}