#!/usr/bin/env perl # # deword-html - a simple demoronizer for HTML documents generated with Word 9 # # $Id$ use strict; use warnings; use HTML::TreeBuilder; #--- auxiliaries to improve HTML::Element ---# # sub HTML::Element::tree_as_HTML { my $self = shift; if ( !ref $self ) { return $self; } elsif ( !$self->isa('HTML::Element') ) { return $self; } elsif ( !$self->isa('HTML::TreeBuilder') ) # hmmm { return $self->as_HTML(@_); } elsif ( $self->tag ne 'html' ) # ?! { return $self->as_HTML(@_); } # un-mess parse()'s handling of initial non-elements my $result = ''; my $after_body = 0; foreach my $x ( $self->content_list ) { if ( !ref $x || !$x->isa('HTML::Element') ) { # nothing } elsif ( $x->tag eq 'body' ) { $after_body = 1; } elsif ( $after_body && $x->tag =~ /^~/ ) { $result .= $x->as_HTML(@_); $x->detach; } } return $result . $self->HTML::Element::as_HTML(@_); } sub HTML::Element::insert_parent # arguments: ($h, $offset, $length, $newparent) # replaces the $length elements of $h at offset $offset with $newparent # and adds them to its children (if any) { my ( $h, $offset, $length, $newparent ) = @_; $newparent->push_content( $h->splice_content( $offset, $length, $newparent ) ); } #--- command line parse ---# # @ARGV or @ARGV = ('-'); my $t = HTML::TreeBuilder->new; $t->store_comments(1); $t->store_declarations(1); $t->store_pis(1); $t->marked_sections(1); # doesn't work :( $t->warn(0); #--- process input documents ---# # foreach my $f (@ARGV) { #--- parse document $f ---# # $t->parse_file($f); # this messes up declarations, such as DOCTYPE, at the start, # putting them at the end of the children # because the code assumes everything is under a single root # # we'll undo this when printing but we may of course do too much there #--- remove everything from the except the title ---# # my $head = $t->find_by_tag_name('head'); if ( defined $head ) { my $title = $head->find_by_tag_name('title'); if ( defined $title ) { $title = $title->clone; } else { $title = HTML::Element->new('title')->push_content("the document '$f'"); } $head->delete_content; $head->push_content($title); } #--- try to recuperate HTML structure from formatting ---# # to allow a real roundtrip: Word -> $0 -> tidy # # experimental: replace

s, but it may

ize # things that weren't my @dd = $t->look_down( sub { $_[0]->tag eq 'p' && defined $_[0]->attr('style') && $_[0]->attr('style') =~ /^margin-left:/; } ); my %is_dd = map { $_ => 1 } @dd; my @dt = $t->look_down( sub { $_[0]->tag eq 'p' && !$is_dd{ $_[0] } && defined( $_[0]->right ) && $is_dd{ $_[0]->right }; } ); my %is_dt = map { $_ => 1 } @dt; my @first_dt = grep { !defined( $_->left ) || !( $is_dt{ $_->left } || $is_dd{ $_->left } ) } @dt; foreach my $dt (@first_dt) { my @dx = (); foreach ( my $dx = $dt ; defined $dx->right ; $dx = $dx->right ) { if ( $is_dt{$dx} ) { #warn "this " , $dx->tag, " is a
\n"; $dx->tag('dt'); push( @dx, $dx ); } elsif ( $is_dd{$dx} ) { #warn "this " , $dx->tag, " is a
\n"; $dx->tag('dd'); push( @dx, $dx ); } else { #warn "this " , $dx->tag, " isn't part of the
\n"; last; } } $dt->parent->insert_parent( $dt->pindex, scalar(@dx), HTML::Element->new('dl') ); next; } #--- remove all MS elements (o:*, v:*, and w:*) ---# # my @mselems = $t->look_down( sub { $_[0]->tag =~ /^[ovw]:/; } ); foreach my $mselem ( reverse @mselems ) # in postorder ! { $mselem->replace_with_content->delete; } #--- remove all MS pragmas and (!--[endif]--> # my @mspragmas = $t->look_down( sub { $_[0]->tag eq '~comment' && $_[0]->attr("text") =~ /^\[(end)?if/; } ); foreach my $mspragma ( reverse @mspragmas ) # in postorder ! { $mspragma->delete; } #--- remove all style and MS attributes from all elements ---# # # in addition, remove all HTML formatting attributes # and
    attributes $t->look_down( sub { foreach my $attr (qw(style align valign height width link vlink type bgcolor)) { $_[0]->attr( $attr, undef ); } foreach my $attr ( $_[0]->all_external_attr_names ) { if ( $attr =~ /^xmlns/ || $attr =~ /^[ovw]:/ || $_[0]->attr($attr) =~ /^Mso/ ) # matches class="MsoNormal" { $_[0]->attr( $attr, undef ); } } 0; # continue! } ); #--- remove all marked sections (if/endif that aren't comments) ---# # $t->look_down( sub { foreach my $item_r ( $_[0]->content_refs_list ) { next if ref $$item_r; #$$item_r =~ s/\<!\[[^]]*\]\>//g; $$item_r =~ s///g; } 0; # continue! } ); #--- remove all  s in text ---# # my $nbsp = $HTML::Entities::entity2char{'nbsp'}; $t->look_down( sub { foreach my $child ( $_[0]->content_refs_list ) { next if ref $$child; # skip the elements $$child =~ s/($nbsp)+/ /g; } 0; # continue! } ); #--- remove certain elements, but not their content ---# # # you may wish to finetune this ... # map { $_->replace_with_content->delete } reverse $t->look_down( # all , elements # unless they have an id, name or class attribute sub { grep { lc( $_[0]->tag ) eq $_ } ( 'font', 'b', 'i', 'em', 'strong', 'br' ); } ); #--- remove certain elements (not their content), unless anchors ---# # map { $_->replace_with_content->delete } reverse $t->look_down( # in postorder! # all , elements # unless they have an id, name or class attribute sub { ( grep { lc( $_[0]->tag ) eq $_ } ( 'span', 'class' ) ) && ( !grep { defined( $_[0]->attr($_) ) } ( 'id', 'name', 'class' ) ); } ); #--- remove all empty elements of certain types ---# # # this must be done recursively # while ( map { $_->replace_with_content->delete } reverse $t->look_down( sub { # a

    , or , or ... ( grep { lc( $_[0]->tag ) eq $_ } ( 'p', 'b', 'em', 'i', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'tr', 'table' ) ) && # which is empty (i.e. without a non-
    element or non-blank character) ( !grep { ref($_) ? lc( $_->tag ) ne 'br' && $_->tag !~ /^~/ : /\S/ } $_[0]->content_list() ); } ) ) { } #--- output the resulting document ---# # print $t->tree_as_HTML; }