#!/usr/bin/env perl
#
# deword-html - a simple demoronizer for HTML documents generated with Word 9
#
# $Id$

use strict;
use warnings;

use HTML::TreeBuilder;

#--- auxiliaries to improve HTML::Element ---#
#

sub HTML::Element::tree_as_HTML
{
  my $self = shift;

  if ( !ref $self )
  {
    return $self;
  }
  elsif ( !$self->isa('HTML::Element') )
  {
    return $self;
  }
  elsif ( !$self->isa('HTML::TreeBuilder') )    # hmmm
  {
    return $self->as_HTML(@_);
  }
  elsif ( $self->tag ne 'html' )                # ?!
  {
    return $self->as_HTML(@_);
  }

  # un-mess parse()'s handling of initial non-elements

  my $result     = '';
  my $after_body = 0;

  foreach my $x ( $self->content_list )
  {
    if ( !ref $x || !$x->isa('HTML::Element') )
    {
      # nothing
    }
    elsif ( $x->tag eq 'body' )
    {
      $after_body = 1;
    }
    elsif ( $after_body && $x->tag =~ /^~/ )
    {
      $result .= $x->as_HTML(@_);
      $x->detach;
    }
  }

  return $result . $self->HTML::Element::as_HTML(@_);
}

sub HTML::Element::insert_parent
# arguments: ($h, $offset, $length, $newparent)
# replaces the $length elements of $h at offset $offset with $newparent
# and adds them to its children (if any)
{
  my ( $h, $offset, $length, $newparent ) = @_;
  $newparent->push_content(
    $h->splice_content( $offset, $length, $newparent ) );
}

#--- command line parse ---#
#

@ARGV
 or @ARGV = ('-');

my $t = HTML::TreeBuilder->new;
$t->store_comments(1);
$t->store_declarations(1);
$t->store_pis(1);
$t->marked_sections(1);    # doesn't work :(
$t->warn(0);

#--- process input documents ---#
#

foreach my $f (@ARGV)
{
  #--- parse document $f ---#
  #

  $t->parse_file($f);

  # this messes up declarations, such as DOCTYPE, at the start,
  # putting them at the end of the <html> children
  # because the code assumes everything is under a single <html> root
  #
  # we'll undo this when printing but we may of course do too much there

  #--- remove everything from the <head> except the title ---#
  #

  my $head = $t->find_by_tag_name('head');
  if ( defined $head )
  {
    my $title = $head->find_by_tag_name('title');

    if ( defined $title )
    {
      $title = $title->clone;
    }
    else
    {
      $title = HTML::Element->new('title')->push_content("the document '$f'");
    }
    $head->delete_content;
    $head->push_content($title);
  }

  #--- try to recuperate HTML structure from formatting ---#
  #    to allow a real roundtrip: Word -> $0 -> tidy       #

  # experimental: replace <p style='margin-left:...> with <dd>,
  # left sibling <p>s with <dt>, and wrap them in <dl>
  # this undoes Word's destruction of <dl>s, but it may <dl>ize
  # things that weren't

  my @dd = $t->look_down(
    sub {
      $_[0]->tag eq 'p'
       && defined $_[0]->attr('style')
       && $_[0]->attr('style') =~ /^margin-left:/;
    }
  );

  my %is_dd = map { $_ => 1 } @dd;

  my @dt = $t->look_down(
    sub {
      $_[0]->tag eq 'p'
       && !$is_dd{ $_[0] }
       && defined( $_[0]->right )
       && $is_dd{ $_[0]->right };
    }
  );

  my %is_dt = map { $_ => 1 } @dt;

  my @first_dt = grep {
        !defined( $_->left )
     || !( $is_dt{ $_->left } || $is_dd{ $_->left } )
  } @dt;

  foreach my $dt (@first_dt)
  {
    my @dx = ();

    foreach ( my $dx = $dt ; defined $dx->right ; $dx = $dx->right )
    {
      if ( $is_dt{$dx} )
      {
        #warn "this " , $dx->tag, " is a <dt>\n";
        $dx->tag('dt');
        push( @dx, $dx );
      }
      elsif ( $is_dd{$dx} )
      {
        #warn "this " , $dx->tag, " is a <dd>\n";
        $dx->tag('dd');
        push( @dx, $dx );
      }
      else
      {
        #warn "this " , $dx->tag, " isn't part of the <dl>\n";
        last;
      }
    }

    $dt->parent->insert_parent( $dt->pindex, scalar(@dx),
      HTML::Element->new('dl') );
    next;
  }

  #--- remove all MS elements (o:*, v:*, and w:*) ---#
  #

  my @mselems = $t->look_down(
    sub {
      $_[0]->tag =~ /^[ovw]:/;
    }
  );

  foreach my $mselem ( reverse @mselems )    # in postorder !
  {
    $mselem->replace_with_content->delete;
  }

  #--- remove all MS pragmas <!--[if (...) ]--> and (!--[endif]-->
  #

  my @mspragmas = $t->look_down(
    sub {
      $_[0]->tag eq '~comment' && $_[0]->attr("text") =~ /^\[(end)?if/;
    }
  );

  foreach my $mspragma ( reverse @mspragmas )    # in postorder !
  {
    $mspragma->delete;
  }

  #--- remove all style and MS attributes from all elements ---#
  #

  # in addition, remove all HTML formatting attributes
  # and <ul type> attributes

  $t->look_down(
    sub {
      foreach
       my $attr (qw(style align valign height width link vlink type bgcolor))
      {
        $_[0]->attr( $attr, undef );
      }

      foreach my $attr ( $_[0]->all_external_attr_names )
      {
        if ( $attr =~ /^xmlns/
          || $attr =~ /^[ovw]:/
          || $_[0]->attr($attr) =~ /^Mso/ )    # matches class="MsoNormal"
        {
          $_[0]->attr( $attr, undef );
        }
      }
      0;                                       # continue!
    }
  );

  #--- remove all marked sections (if/endif that aren't comments)  ---#
  #

  $t->look_down(
    sub {
      foreach my $item_r ( $_[0]->content_refs_list )
      {
        next if ref $$item_r;

        #$$item_r =~ s/\&lt;!\[[^]]*\]\&gt;//g;
        $$item_r =~ s/<!\[[^]]*\]>//g;
      }
      0;    # continue!
    }
  );

  #--- remove all &nbsp;s in text ---#
  #

  my $nbsp = $HTML::Entities::entity2char{'nbsp'};

  $t->look_down(
    sub {
      foreach my $child ( $_[0]->content_refs_list )
      {
        next if ref $$child;    # skip the elements
        $$child =~ s/($nbsp)+/ /g;
      }
      0;                        # continue!
    }
  );

  #--- remove certain elements, but not their content ---#
  #
  # you may wish to finetune this ...
  #
  map { $_->replace_with_content->delete } reverse $t->look_down(

    # all <span>, <class> elements
    # unless they have an id, name or class attribute
    sub {
      grep { lc( $_[0]->tag ) eq $_ }
       ( 'font', 'b', 'i', 'em', 'strong', 'br' );
    }
  );

  #--- remove certain elements (not their content), unless anchors ---#
  #
  map { $_->replace_with_content->delete }
   reverse $t->look_down(    # in postorder!
    # all <span>, <class> elements
    # unless they have an id, name or class attribute
    sub {
      ( grep { lc( $_[0]->tag ) eq $_ } ( 'span', 'class' ) )
       && ( !grep { defined( $_[0]->attr($_) ) } ( 'id', 'name', 'class' ) );
    }
   );

  #--- remove all empty elements of certain types ---#
  #
  # this must be done recursively
  #
  while (
    map { $_->replace_with_content->delete } reverse $t->look_down(
      sub {
        # a <p>, or <b>, or ...
        (
          grep { lc( $_[0]->tag ) eq $_ } (
            'p',  'b',  'em', 'i',  'h1', 'h2', 'h3', 'h4',
            'h5', 'h6', 'td', 'tr', 'table'
          )
         )
         &&
         # which is empty (i.e. without a non-<br> element or non-blank character)
         ( !grep { ref($_) ? lc( $_->tag ) ne 'br' && $_->tag !~ /^~/ : /\S/ }
          $_[0]->content_list() );
      }
    )
   )
  {
  }

  #--- output the resulting document ---#
  #

  print $t->tree_as_HTML;
}