aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xbanner2csv.pl125
1 files changed, 125 insertions, 0 deletions
diff --git a/banner2csv.pl b/banner2csv.pl
new file mode 100755
index 0000000..bb37436
--- /dev/null
+++ b/banner2csv.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+# FILE: "/home/evmik/src/my_src/GradeBook/html_table2csv.pl"
+# LAST MODIFICATION: "Thu, 30 Aug 2012 23:48:31 -0400 (evmik)"
+# (C) 2012 by Eugeniy Mikhailov, <evgmik@gmail.com>
+
+#use HTML::Entities;
+use HTML::Parser;
+use HTML::TableExtract;
+use Encode;
+#use HTML::TableExtract qw(tree);
+use Data::Dumper;
+#use strict;
+# ---------------------------------------------------------------------------
+# Setup: read the saved HTML page named on the command line, locate the
+# student table in it, and build the two per-cell parsers used further down.
+# ---------------------------------------------------------------------------
+my $html_file = shift;
+defined $html_file
+    or die "Usage: $0 <saved_page.html>\n";
+
+# Slurp the whole page in one read; the handle is lexical and closed right
+# away so nothing stays open for the rest of the run.
+open(my $html_fh, "<", $html_file) or die "Failed to read file $html_file : $!";
+my $html_string = do { local $/; <$html_fh> };
+close($html_fh);
+$html_string = '' unless defined $html_string;
+
+
+# This looks for the table based on its headers.
+# keep_html     => 1 : cells keep their raw HTML so the <a>/<span> markup
+#                      can be parsed again below.
+# slice_columns => 0 : keep every column of the matched table, not only the
+#                      ones under the listed headers (the row loop indexes
+#                      columns 1, 2 and 7 directly).
+my $te = HTML::TableExtract->new(
+    headers       => [qw(Record Student ID)],
+    keep_html     => 1,
+    slice_columns => 0
+);
+
+# find table with students info
+$te->parse($html_string);
+
+# Shared capture slots written by the handler subs below:
+#   $parsed_text - whitespace-normalised text of the last closed <a>/<span>
+#   $href        - href attribute of the last <a> that was opened
+my $parsed_text="";
+my $href="";
+
+# Parser that reports only <a> and <img> tags; used for the cells holding
+# links (student name, email).
+my $aparser = HTML::Parser->new(api_version => 3,
+    start_h => [\&a_start_handler, "self,tagname,attr"],
+    report_tags => [qw(a img)],
+    );
+
+# Parser that reports only <span> tags; used for the student id cell.
+my $spanparser = HTML::Parser->new(api_version => 3,
+    start_h => [\&span_start_handler, "self,tagname,attr"],
+    report_tags => [qw(span)],
+    );
+
+# Start-tag callback for the <span> parser.
+#
+# When a <span> opens, begin buffering the text that follows and arrange for
+# spantag_end_handler to run on the next reported end tag.  A nested <span>
+# re-enters this sub and therefore starts a fresh buffer.
+#
+#   $parser  - the HTML::Parser object that fired the event
+#   $tagname - name of the tag that was opened
+sub span_start_handler
+{
+    my ($parser, $tagname) = @_;
+    if ($tagname ne "span") {
+        return;
+    }
+
+    # The array ref installed here is the text buffer; the end handler
+    # fetches it back with handler("text") and joins its contents.
+    $parser->handler("text", [], '@{dtext}');
+    $parser->handler("start", \&span_start_handler);
+    $parser->handler("end", \&spantag_end_handler, "self,tagname");
+}
+
+# Start-tag callback for the anchor parser.
+#
+# For an <a> that carries an href: remember the href in the file-level
+# $href, start buffering the link text, let <img> tags inside the link
+# contribute their alt text (img_handler), and arrange for atag_end_handler
+# to run on the next reported end tag.
+#
+#   $parser  - the HTML::Parser object that fired the event
+#   $tagname - name of the tag that was opened
+#   $attrs   - hash ref of the tag's attributes
+sub a_start_handler
+{
+    my ($parser, $tagname, $attrs) = @_;
+
+    # Only anchors that actually link somewhere are of interest.
+    if ($tagname ne "a") {
+        return;
+    }
+    if (!exists $attrs->{href}) {
+        return;
+    }
+
+    $href = $attrs->{href};
+
+    # The array ref installed here is the text buffer for the link body.
+    $parser->handler("text", [], '@{dtext}');
+    $parser->handler("start", \&img_handler);
+    $parser->handler("end", \&atag_end_handler, "self,tagname");
+}
+
+# Start-tag callback that is active while inside an <a>.
+#
+# An <img> inside the link adds its alt text to the link's text buffer, or
+# the placeholder "[IMG]" when alt is missing or false.  Any other tag is
+# ignored.
+#
+#   $parser  - the HTML::Parser object that fired the event
+#   $tagname - name of the tag that was opened
+#   $attrs   - hash ref of the tag's attributes
+sub img_handler
+{
+    my ($parser, $tagname, $attrs) = @_;
+    if ($tagname ne "img") {
+        return;
+    }
+
+    my $label = $attrs->{alt} || "[IMG]";
+    push @{ $parser->handler("text") }, $label;
+}
+
+# End-tag callback installed by a_start_handler.
+#
+# Joins the buffered link text, encodes it as UTF-8, trims and collapses
+# whitespace, and stores the result in the file-level $parsed_text.  Then
+# puts the parser back into its "waiting for the next <a>" state.
+#
+#   $parser  - the HTML::Parser object that fired the event
+#   $tagname - name of the tag that was closed (not inspected)
+sub atag_end_handler
+{
+    my ($parser, $tagname) = @_;
+
+    my $chunks = $parser->handler("text");
+    my $text   = encode('utf8', join("", @{$chunks}));
+
+    # Strip leading/trailing blanks, then squeeze inner runs to one space.
+    for ($text) {
+        s/^\s+//;
+        s/\s+$//;
+        s/\s+/ /g;
+    }
+    $parsed_text = $text;
+
+    # Stop buffering and go back to watching for the next anchor.
+    $parser->handler(text  => undef);
+    $parser->handler(start => \&a_start_handler);
+    $parser->handler(end   => undef);
+}
+
+# End-tag callback installed by span_start_handler.
+#
+# Joins the buffered span text, encodes it as UTF-8, trims and collapses
+# whitespace, and stores the result in the file-level $parsed_text.  Then
+# puts the parser back into its "waiting for the next <span>" state.
+#
+#   $parser  - the HTML::Parser object that fired the event
+#   $tagname - name of the tag that was closed (not inspected)
+sub spantag_end_handler
+{
+    my ($parser, $tagname) = @_;
+
+    my $joined = join("", @{ $parser->handler("text") });
+    my $text   = encode('utf8', $joined);
+
+    # Strip leading/trailing blanks, then squeeze inner runs to one space.
+    for ($text) {
+        s/^\s+//;
+        s/\s+$//;
+        s/\s+/ /g;
+    }
+    $parsed_text = $text;
+
+    # Stop buffering and go back to watching for the next span.
+    $parser->handler(text  => undef);
+    $parser->handler(start => \&span_start_handler);
+    $parser->handler(end   => undef);
+}
+
+
+# Run the HTML of one table cell through $parser and return what the
+# handlers captured, as the list (text, href).
+#
+#   $parser        - $aparser or $spanparser
+#   $start_handler - the start-tag callback that parser begins with
+#   $cell_html     - raw HTML of the cell; may be undef for a short row
+#
+# The shared capture variables are cleared and the parser is re-armed before
+# every cell.  Without this, a cell lacking the expected <a>/<span> (or one
+# with an unterminated tag) would silently reuse the value captured from an
+# earlier cell or row.
+sub _scan_cell
+{
+    my ($parser, $start_handler, $cell_html) = @_;
+
+    $parsed_text = "";
+    $href        = "";
+
+    # Put the parser back into its initial handler state.
+    $parser->handler(start => $start_handler, "self,tagname,attr");
+    $parser->handler(text  => undef);
+    $parser->handler(end   => undef);
+
+    if (defined $cell_html && length $cell_html) {
+        $parser->parse($cell_html);
+        # Each cell is a complete fragment: flush it so nothing buffered
+        # carries over into the next cell.
+        $parser->eof;
+    }
+
+    return ($parsed_text, $href);
+}
+
+# rows() without arguments uses the first table matched by $te.
+# Emit one "name,id,email" line per table row.
+# NOTE(review): fields are written without CSV quoting; a name containing a
+# comma will produce extra columns -- confirm what downstream tools expect.
+foreach my $row ($te->rows) {
+    # student name: text of the link in column 1
+    my ($name) = _scan_cell($aparser, \&a_start_handler, $row->[1]);
+
+    # student id: text of the span in column 2
+    my ($sid) = _scan_cell($spanparser, \&span_start_handler, $row->[2]);
+
+    # student email: href of the link in column 7, without the mailto: scheme
+    my (undef, $email) = _scan_cell($aparser, \&a_start_handler, $row->[7]);
+    $email =~ s/mailto://;
+
+    print join(",", $name, $sid, $email), "\n";
+}
+