aboutsummaryrefslogtreecommitdiff
path: root/banner2csv.pl
blob: bb3743624ec3a918fae5a40ad34eede8269ddb33 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/perl
# FILE: "/home/evmik/src/my_src/GradeBook/html_table2csv.pl"
# LAST MODIFICATION: "Thu, 30 Aug 2012 23:48:31 -0400 (evmik)"
# (C) 2012 by Eugeniy Mikhailov, <evgmik@gmail.com>

#use HTML::Entities;
use HTML::Parser;
use HTML::TableExtract;
use Encode;
#use HTML::TableExtract qw(tree);
use Data::Dumper;
#use strict;
# Path of the HTML page to convert, taken from the first command-line argument.
my $html_file = shift;
die "Usage: $0 <html_file>\n" unless defined $html_file;

# Slurp the whole page into one string: the table extractor below is handed
# the complete document in a single parse() call.
open (my $fh_html, "<", $html_file)  or  die "Failed to read file $html_file : $!";
my $html_string = do { local $/; <$fh_html> };
close ($fh_html);
# An empty file yields undef in some perls' slurp mode; keep it a string.
$html_string = "" unless defined $html_string;


# Locate the table by its column headers.  The cell markup is kept
# (keep_html) because the link and span tags inside the cells are parsed
# further down, and slice_columns is off so that whole rows are returned
# rather than only the columns matching the headers.
my @wanted_headers = ('Record', 'Student', 'ID');
my $te = HTML::TableExtract->new(
	headers       => \@wanted_headers,
	keep_html     => 1,
	slice_columns => 0,
);

# find the table with the students info
$te->parse($html_string);

# Results handed from the parser callbacks back to the main loop:
#   $parsed_text - whitespace-normalized text of the element that was just
#                  closed (set by the end-tag handlers)
#   $href        - href attribute of the last <a> that was opened
#                  (set by a_start_handler)
my $parsed_text="";
my $href="";

# Parser for cells that hold a link; only <a> and <img> tags are reported.
my $aparser = HTML::Parser->new(api_version => 3,
     start_h => [\&a_start_handler, "self,tagname,attr"],
     report_tags => [qw(a img)],
    );

# Parser for cells that hold a <span>; only <span> tags are reported.
my $spanparser = HTML::Parser->new(api_version => 3,
     start_h => [\&span_start_handler, "self,tagname,attr"],
     report_tags => [qw(span)],
    );

# Start-tag callback of $spanparser.  On an opening <span> it begins
# collecting the decoded text that follows and arms the end-tag callback.
#   $parser  - the HTML::Parser object that fired the event
#   $tagname - name of the tag that was opened
sub span_start_handler
{
    my ($parser, $tagname) = @_;
    if ($tagname ne "span") {
        return;
    }

    # With an array reference as handler the parser pushes every decoded
    # text chunk onto that array.
    $parser->handler(text  => [], '@{dtext}');
    # A nested <span> start goes through this sub again and restarts the
    # collection.
    $parser->handler(start => \&span_start_handler);
    $parser->handler(end   => \&spantag_end_handler, "self,tagname");
}

# Start-tag callback of $aparser.  On an opening <a> that carries an href
# it records the link target in $href, begins collecting the link text and
# arms the end-tag callback.
#   $parser  - the HTML::Parser object that fired the event
#   $tagname - name of the tag that was opened
#   $attrs   - hash reference with the tag's attributes
sub a_start_handler
{
    my ($parser, $tagname, $attrs) = @_;
    return unless $tagname eq "a" && exists $attrs->{href};
    $href = $attrs->{href};

    # With an array reference as handler the parser pushes every decoded
    # text chunk onto that array.
    $parser->handler(text  => [], '@{dtext}');
    # While inside the link only images are of interest (for their alt text).
    $parser->handler(start => \&img_handler);
    $parser->handler(end   => \&atag_end_handler, "self,tagname");
}

# Start-tag callback used while inside a link.  An <img> contributes its
# alt text to the collected link text, or the placeholder "[IMG]" when the
# alt attribute is missing or false.
#   $parser  - the HTML::Parser object that fired the event
#   $tagname - name of the tag that was opened
#   $attrs   - hash reference with the tag's attributes
sub img_handler
{
    my ($parser, $tagname, $attrs) = @_;
    return if $tagname ne "img";

    my $collected   = $parser->handler("text");
    my $replacement = $attrs->{alt} || "[IMG]";
    push @{$collected}, $replacement;
}

# End-tag callback of $aparser, armed by a_start_handler.  When the link is
# closed it stores the collected link text (UTF-8 encoded, whitespace
# trimmed and collapsed) in $parsed_text and puts the parser back into the
# "waiting for the next <a>" state.
#   $self - the HTML::Parser object that fired the event
#   $tag  - name of the tag that was closed
sub atag_end_handler
{
    my($self, $tag) = @_;
    # The parser reports <img> tags as well, so a stray </img> inside the
    # link must not be mistaken for the end of the anchor.
    return unless $tag eq "a";

    my $text = encode('utf8',join("", @{$self->handler("text")}));
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    #print "T $text\n";
    $parsed_text = $text;

    # Stop collecting text and wait for the next link.
    $self->handler("text", undef);
    $self->handler("start", \&a_start_handler);
    $self->handler("end", undef);
}

# End-tag callback of $spanparser, armed by span_start_handler.  It stores
# the collected span text (UTF-8 encoded, whitespace trimmed and collapsed)
# in $parsed_text and puts the parser back into the "waiting for the next
# <span>" state.
#   $parser  - the HTML::Parser object that fired the event
#   $tagname - name of the tag that was closed (not inspected)
sub spantag_end_handler
{
    my ($parser, $tagname) = @_;

    my $chunks  = $parser->handler("text");
    my $cleaned = encode('utf8', join("", @{$chunks}));
    for ($cleaned) {
        s/^\s+//;       # drop leading whitespace
        s/\s+$//;       # drop trailing whitespace
        s/\s+/ /g;      # collapse inner runs to one blank
    }
    $parsed_text = $cleaned;

    # Stop collecting text and wait for the next span.
    $parser->handler("text", undef);
    $parser->handler("start", \&span_start_handler);
    $parser->handler("end", undef);
}


# Shorthand...top level rows() method assumes the first table found in
# the document if no arguments are supplied.
#
# Print one "name,id,email" line per table row.  The parser callbacks
# report their findings through $parsed_text and $href; both are cleared
# before each cell so that a cell without the expected markup yields an
# empty field instead of silently repeating the value found in an earlier
# cell.  eof() after each cell flushes the parser so that every cell is
# parsed as a document of its own.
foreach my $row ($te->rows) {
	# student name: text of the link in column 1
	$parsed_text = "";
	$aparser->parse($row->[1]);
	$aparser->eof;
	my $name = $parsed_text;

	# student id: text of the span in column 2
	$parsed_text = "";
	$spanparser->parse($row->[2]);
	$spanparser->eof;
	my $sid = $parsed_text;

	# student email: target of the mailto: link in column 7
	$href = "";
	$aparser->parse($row->[7]);
	$aparser->eof;
	my $email = $href;
	# Only strip the URL scheme itself, whatever its letter case.
	$email =~ s/^mailto://i;

	# NOTE(review): fields are written unquoted, so a comma inside a name
	# produces an extra column -- confirm what the consumer expects.
	print $name, "," , $sid, "," , $email;
	print "\n";
}