#!/usr/bin/perl
# FILE: "/home/evmik/src/my_src/GradeBook/banner2csv.pl"
# LAST MODIFICATION: "Fri, 17 Jan 2014 11:12:43 -0500 (evmik)"
# (C) 2012 by Eugeniy Mikhailov,
#
# Reads a saved WM Banner class summary list (HTML), locates the table whose
# header row contains "Record", "Student" and "ID", and prints one CSV line
# per table row to STDOUT:
#
#     student_id,first_name,last_name,section,email
#
# The section is not extracted; it is always printed as "unknown".

use strict;
use warnings;

#use HTML::Entities;
use HTML::Parser;
use HTML::TableExtract;
use Encode;
#use HTML::TableExtract qw(tree);
use Data::Dumper;

if ( @ARGV < 1 ) {
    print "Converts a WM banner summary list to csv file with students info\n";
    print "\n";
    print "Usage:\n";
    print " " . $0 . " banner_html_file_to_parse\n";
    print "\n";
    print "Example:\n";
    print " " . $0 . " banner_class_summary_list.html\n";
    exit 1;
}

my $html_file = shift;

# Slurp the whole file; the table extractor works on a single string.
open( my $fh_html, "<", $html_file )
    or die "Failed to read file $html_file : $!";
my $html_string = do { local $/; <$fh_html> };
close($fh_html);
$html_string = '' unless defined $html_string;

# this look for the table based on its headers
my $te = HTML::TableExtract->new(
    headers       => [qw(Record Student ID)],
    keep_html     => 1,    # cells keep their markup so links/spans can be parsed below
    slice_columns => 0,    # keep every column, not only the ones matching headers
);

# find table with students info
$te->parse($html_string);

# Results of the most recent cell parse. They are written by the parser
# callbacks below and read by the main loop.
my $parsed_text = "";    # cleaned text of the last closed <a> or <span>
my $href        = "";    # href attribute of the last <a> seen

# Parser that captures the text and href of <a> elements (with <img> alt text).
my $aparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&a_start_handler, "self,tagname,attr" ],
    report_tags => [qw(a img)],
);

# Parser that captures the text of <span> elements.
my $spanparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&span_start_handler, "self,tagname,attr" ],
    report_tags => [qw(span)],
);

# Start-tag callback for $spanparser: on <span>, begin collecting decoded text
# and arrange for spantag_end_handler to run at the next reported end tag.
sub span_start_handler {
    my ( $self, $tag, $attr ) = @_;
    return unless $tag eq "span";
    $self->handler( text => [], '@{dtext}' );
    # A nested <span> re-enters this handler and restarts the text capture.
    $self->handler( start => \&span_start_handler );
    $self->handler( end   => \&spantag_end_handler, "self,tagname" );
    return;
}

# Start-tag callback for $aparser: on <a href=...>, remember the href in $href,
# begin collecting decoded text, and route further start tags to img_handler
# until the end tag arrives.
sub a_start_handler {
    my ( $self, $tag, $attr ) = @_;
    return unless $tag eq "a";
    return unless exists $attr->{href};
    $href = $attr->{href};
    #print "A $attr->{href}\n";
    $self->handler( text => [], '@{dtext}' );
    $self->handler( start => \&img_handler );
    $self->handler( end   => \&atag_end_handler, "self,tagname" );
    return;
}

# Start-tag callback active inside an <a>: an <img> contributes its alt text
# (or the placeholder "[IMG]") to the text being collected.
sub img_handler {
    my ( $self, $tag, $attr ) = @_;
    return unless $tag eq "img";
    push( @{ $self->handler("text") }, $attr->{alt} || "[IMG]" );
    return;
}

# Shared tail of both end-tag callbacks: join the collected text chunks,
# encode to UTF-8 bytes, trim and collapse whitespace, store the result in
# $parsed_text, then put the parser back into its "waiting for a start tag"
# state using $start_handler.
sub _finish_capture {
    my ( $self, $start_handler ) = @_;
    my $chunks = $self->handler("text");
    my @pieces = ref($chunks) eq 'ARRAY' ? @{$chunks} : ();
    my $text   = encode( 'utf8', join( "", @pieces ) );
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    #print "T $text\n";
    $parsed_text = $text;
    $self->handler( "text",  undef );
    $self->handler( "start", $start_handler );
    $self->handler( "end",   undef );
    return;
}

# End-tag callback for $aparser.
sub atag_end_handler {
    my ( $self, $tag ) = @_;
    _finish_capture( $self, \&a_start_handler );
    return;
}

# End-tag callback for $spanparser.
sub spantag_end_handler {
    my ( $self, $tag ) = @_;
    _finish_capture( $self, \&span_start_handler );
    return;
}

# Parse one table cell with the given parser. $parsed_text and $href are
# cleared first so that a cell without the expected markup yields empty
# values instead of silently reusing what the previous cell produced.
sub _parse_cell {
    my ( $parser, $cell_html ) = @_;
    $parsed_text = "";
    $href        = "";
    return unless defined $cell_html;
    $parser->parse($cell_html);
    # Flush buffered input so the next cell starts as a fresh document.
    $parser->eof;
    return;
}

# Fail with a clear message when the expected table is absent, instead of
# dying on an undefined table object inside rows().
my @found_tables = $te->tables;
die "No table with headers Record/Student/ID found in $html_file\n"
    unless @found_tables;

# Column positions used below: 1 = name link, 2 = id span, 7 = e-mail link.
foreach my $row ( $te->rows ) {
    #student name, expected as "Last, First" inside a link
    _parse_cell( $aparser, $row->[1] );
    my $name = $parsed_text;
    my ( $lname, $fname ) = split( ',', $name );
    $lname = '' unless defined $lname;
    $fname = '' unless defined $fname;
    $fname =~ s/^ //;

    #student id
    #print @$row[2];
    _parse_cell( $spanparser, $row->[2] );
    my $sid = $parsed_text;

    #student email, taken from the href of a mailto: link
    _parse_cell( $aparser, $row->[7] );
    my $email = $href;
    $email =~ s/mailto://;

    # section
    my $section = "unknown";

    print join( ',', $sid, $fname, $lname, $section, $email );
    print "\n";
}