diff options
-rwxr-xr-x | banner2csv.pl | 125 |
1 files changed, 125 insertions, 0 deletions
#!/usr/bin/perl
# FILE: "/home/evmik/src/my_src/GradeBook/html_table2csv.pl"
# LAST MODIFICATION: "Thu, 30 Aug 2012 23:48:31 -0400 (evmik)"
# (C) 2012 by Eugeniy Mikhailov, <evgmik@gmail.com>
#
# banner2csv.pl -- print "name,id,email" lines for a student table found
# in a saved HTML page.
#
# Usage: banner2csv.pl page.html > students.csv
#
# The table is located by its header cells (Record, Student, ID).  For each
# row the script takes:
#   column 1 -- text of the <a href=...> link  -> name
#   column 2 -- text of the <span> element     -> student id
#   column 7 -- href of the <a> link, with the "mailto:" prefix removed
#
# Fields are printed as-is, separated by commas, with no CSV quoting.
# NOTE(review): encode('utf8', ...) below assumes the parsed text is a
# character string (or Latin-1 bytes); confirm against the encoding of the
# pages actually fed to this script.

#use HTML::Entities;
use HTML::Parser;
use HTML::TableExtract;
use Encode;
#use HTML::TableExtract qw(tree);
use Data::Dumper;
use strict;
use warnings;

my $html_file = shift;
die "Usage: $0 <html_file>\n" unless defined $html_file;

open(my $html_fh, "<", $html_file)
    or die "Failed to read file $html_file : $!";
my $html_string = do { local $/; <$html_fh> };    # slurp the whole file
close($html_fh);
$html_string = "" unless defined $html_string;

# This looks for the table based on its headers.  keep_html leaves the
# markup inside each cell so the small parsers below can pick out links and
# spans; slice_columns => 0 asks for all columns of the matched table, not
# only those under the listed headers.
my $te = HTML::TableExtract->new(
    headers       => [qw(Record Student ID)],
    keep_html     => 1,
    slice_columns => 0,
);

# find table with students info
$te->parse($html_string);

my @found_tables = $te->tables;
warn "No table with headers 'Record', 'Student', 'ID' found in $html_file\n"
    unless @found_tables;

# Results of the most recent cell parse.  The handlers below write to these;
# parse_cell() clears them before every cell so that a cell without the
# expected markup yields empty strings instead of an earlier cell's values.
my $parsed_text = "";
my $href        = "";

# Parser that captures the href and the link text of an <a href=...> element
# (an <img> inside the link contributes its alt text).
my $aparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [\&a_start_handler, "self,tagname,attr"],
    report_tags => [qw(a img)],
);

# Parser that captures the text inside a <span> element.
my $spanparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [\&span_start_handler, "self,tagname,attr"],
    report_tags => [qw(span)],
);

# Start-tag handler for $spanparser: on <span>, begin accumulating text and
# arrange for spantag_end_handler to run at the closing tag.
sub span_start_handler
{
    my ($self, $tag, $attr) = @_;
    return unless $tag eq "span";

    $self->handler(text  => [], '@{dtext}');
    $self->handler(start => \&span_start_handler);
    $self->handler(end   => \&spantag_end_handler, "self,tagname");
}

# Start-tag handler for $aparser: on <a href=...>, remember the href, begin
# accumulating text, and switch the start handler to img_handler until the
# link is closed.
sub a_start_handler
{
    my ($self, $tag, $attr) = @_;
    return unless $tag eq "a";
    return unless exists $attr->{href};
    $href = $attr->{href};
    #print "A $attr->{href}\n";

    $self->handler(text  => [], '@{dtext}');
    $self->handler(start => \&img_handler);
    $self->handler(end   => \&atag_end_handler, "self,tagname");
}

# Start-tag handler active inside a link: an <img> adds its alt text (or
# "[IMG]" when alt is missing or empty) to the accumulated link text.
sub img_handler
{
    my ($self, $tag, $attr) = @_;
    return unless $tag eq "img";
    my $chunks = $self->handler("text");
    return unless ref($chunks) eq 'ARRAY';
    push(@{$chunks}, $attr->{alt} || "[IMG]");
}

# Shared tail of both end handlers: join the accumulated text, normalise
# whitespace, store the result in $parsed_text, and put the parser back into
# its "waiting for a start tag" state with $restart_handler.
sub _finish_capture
{
    my ($self, $restart_handler) = @_;

    my $chunks = $self->handler("text");
    my @pieces = ref($chunks) eq 'ARRAY' ? @{$chunks} : ();

    my $text = encode('utf8', join("", @pieces));
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    #print "T $text\n";
    $parsed_text = $text;

    $self->handler("text",  undef);
    $self->handler("start", $restart_handler);
    $self->handler("end",   undef);
}

# End-tag handler for $aparser.
sub atag_end_handler
{
    my ($self, $tag) = @_;
    _finish_capture($self, \&a_start_handler);
}

# End-tag handler for $spanparser.
sub spantag_end_handler
{
    my ($self, $tag) = @_;
    _finish_capture($self, \&span_start_handler);
}

# Parse one table cell with $parser and return ($parsed_text, $href) for
# that cell only.  $start_handler is the parser's initial start handler; it
# is re-installed afterwards so an unclosed tag in this cell cannot leave
# the parser in capture mode for the next one.  An undefined cell is
# treated as empty.
sub parse_cell
{
    my ($parser, $start_handler, $cell_html) = @_;

    $parsed_text = "";
    $href        = "";

    $parser->parse(defined $cell_html ? $cell_html : "");
    $parser->eof;    # end of this fragment: flush and start fresh next time

    $parser->handler("text", undef);
    $parser->handler(start => $start_handler, "self,tagname,attr");
    $parser->handler("end", undef);

    return ($parsed_text, $href);
}

# Shorthand...top level rows() method assumes the first table found in
# the document if no arguments are supplied.
foreach my $row ($te->rows) {
    #student name
    my ($name) = parse_cell($aparser, \&a_start_handler, $row->[1]);

    #student id
    my ($sid) = parse_cell($spanparser, \&span_start_handler, $row->[2]);

    #student email
    my (undef, $email) = parse_cell($aparser, \&a_start_handler, $row->[7]);
    $email =~ s/^\s*mailto://i;

    print $name, ",", $sid, ",", $email;
    print "\n";
}