1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
#!/usr/bin/perl
# FILE: "/home/evmik/src/my_src/GradeBook/html_table2csv.pl"
# LAST MODIFICATION: "Thu, 30 Aug 2012 23:48:31 -0400 (evmik)"
# (C) 2012 by Eugeniy Mikhailov, <evgmik@gmail.com>
#use HTML::Entities;
use HTML::Parser;
use HTML::TableExtract;
use Encode;
#use HTML::TableExtract qw(tree);
use Data::Dumper;
#use strict;
# Read the HTML document named by the first command-line argument and keep
# its entire content in $html_string for the table extractor.
my $html_file = shift;
defined $html_file
    or die "Usage: $0 <html_file>\n";

# A lexical handle (rather than a bareword global) is scoped to this file
# and can be closed explicitly once the content has been read.
open(my $fh_html, "<", $html_file)
    or die "Failed to read file $html_file : $!";
my @fcontent = <$fh_html>;
close($fh_html);

# The lines are glued back together unchanged; no decoding layer is applied,
# so the parsers receive the file's raw bytes.
my $html_string = join('', @fcontent);
# Locate the table with students info by its column headers.
#   keep_html     => 1  cells are returned as raw HTML instead of plain text
#   slice_columns => 0  every column of the matching table is kept, not only
#                       the columns sitting under a matched header
my %extract_opts = (
    headers       => [ 'Record', 'Student', 'ID' ],
    keep_html     => 1,
    slice_columns => 0,
);
my $te = HTML::TableExtract->new(%extract_opts);

# Run the extractor over the whole document.
$te->parse($html_string);
# Results of the most recent cell parse.  The handler subs in this file
# write to these variables; the row loop reads them after each parse() call.
my $parsed_text = "";    # whitespace-normalised text of the last <a>/<span>
my $href        = "";    # href attribute of the last <a href=...> seen

# Parser for cells holding a link: only <a> and <img> tags are reported, and
# every start tag is first delivered to a_start_handler.
# Declared with "my" so the script does not rely on undeclared package globals.
my $aparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&a_start_handler, "self,tagname,attr" ],
    report_tags => [qw(a img)],
);

# Parser for cells holding a <span>: only <span> tags are reported, and every
# start tag is delivered to span_start_handler.
my $spanparser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&span_start_handler, "self,tagname,attr" ],
    report_tags => [qw(span)],
);
# Start-tag callback of $spanparser.  When a <span> opens, the parser is
# armed to collect the element's text: text events accumulate into a fresh
# array and the end-tag event is routed to spantag_end_handler.
#   $parser  - the HTML::Parser object ("self" in the argspec)
#   $tagname - name of the tag that was opened
# A third argument (the attribute hash) is passed by the parser but is not
# needed here.
sub span_start_handler
{
    my $parser  = shift;
    my $tagname = shift;
    return if $tagname ne "span";

    # New accumulator array: each decoded text chunk is pushed onto it.
    $parser->handler(text => [], '@{dtext}');
    # Start tags keep arriving at this sub (no argspec is given here).
    $parser->handler(start => \&span_start_handler);
    $parser->handler(end => \&spantag_end_handler, "self,tagname");
}
# Start-tag callback of $aparser.  For an <a> that carries an href it
# records the link target in $href and arms the parser to collect the
# anchor's text until the matching end-tag event.
#   $parser  - the HTML::Parser object ("self" in the argspec)
#   $tagname - name of the tag that was opened
#   $attrs   - hash ref of the tag's attributes
sub a_start_handler
{
    my ($parser, $tagname, $attrs) = @_;

    # Only anchors that actually have a link target are of interest.
    return if $tagname ne "a";
    return if !exists $attrs->{href};

    $href = $attrs->{href};

    # Collect decoded text chunks into a fresh array ...
    $parser->handler(text => [], '@{dtext}');
    # ... let images inside the anchor contribute their alt text ...
    $parser->handler(start => \&img_handler);
    # ... and finish the capture on the next reported end tag.
    $parser->handler(end => \&atag_end_handler, "self,tagname");
}
# Start-tag callback that a_start_handler installs while an anchor is open.
# An <img> adds its alt text to the anchor's text accumulator; an image
# with a missing or false alt value contributes the placeholder "[IMG]".
#   $parser  - the HTML::Parser object
#   $tagname - name of the tag that was opened
#   $attrs   - hash ref of the tag's attributes
sub img_handler
{
    my ($parser, $tagname, $attrs) = @_;
    return if $tagname ne "img";

    my $label = $attrs->{alt} || "[IMG]";
    # handler("text") hands back the array the text events are collected in.
    my $accumulator = $parser->handler("text");
    push(@{$accumulator}, $label);
}
# End-tag callback installed by a_start_handler.  It joins the collected
# text chunks, encodes the result as UTF-8, trims and collapses whitespace,
# stores it in $parsed_text, and then puts the parser back into its
# "waiting for <a>" state.
#   $parser  - the HTML::Parser object
#   $tagname - name of the closed tag (not inspected: any reported end tag
#              finishes the capture)
sub atag_end_handler
{
    my ($parser, $tagname) = @_;

    my $chunks = $parser->handler("text");
    my $label  = encode('utf8', join("", @{$chunks}));
    for ($label) {
        s/^\s+//;       # strip leading whitespace
        s/\s+$//;       # strip trailing whitespace
        s/\s+/ /g;      # squeeze inner runs to a single blank
    }
    $parsed_text = $label;

    # Stop collecting text and wait for the next anchor.
    $parser->handler("text", undef);
    $parser->handler("start", \&a_start_handler);
    $parser->handler("end", undef);
}
# End-tag callback installed by span_start_handler.  It turns the collected
# text chunks into one UTF-8 encoded, whitespace-normalised string, stores
# that in $parsed_text, and re-arms the parser for the next <span>.
#   $parser  - the HTML::Parser object
#   $tagname - name of the closed tag (not inspected)
sub spantag_end_handler
{
    my $parser = shift;

    my $collected = join("", @{ $parser->handler("text") });
    my $value     = encode('utf8', $collected);
    $value =~ s/^\s+//;     # drop leading whitespace
    $value =~ s/\s+$//;     # drop trailing whitespace
    $value =~ s/\s+/ /g;    # collapse inner whitespace runs
    $parsed_text = $value;

    # Text collection ends here; start tags go back to span_start_handler.
    $parser->handler("text", undef);
    $parser->handler("start", \&span_start_handler);
    $parser->handler("end", undef);
}
# Print one CSV line per row of the first matching table:
#   student name , student id , e-mail address
# The table object is fetched explicitly so that a document without a
# matching table produces a clear error message.
my $table = $te->first_table_found
    or die "No table with the expected headers was found in the input\n";

# Parse the HTML of a single cell with the given parser.
# $parsed_text and $href are cleared first so that a cell without the
# expected <a>/<span> yields empty values instead of silently reusing what
# an earlier cell left behind.  eof() flushes whatever the parser still has
# buffered, so input from one cell cannot spill into the next parse() call.
my $parse_cell = sub {
    my ($parser, $cell_html) = @_;
    $parsed_text = "";
    $href        = "";
    return unless defined $cell_html;    # row shorter than expected
    $parser->parse($cell_html);
    $parser->eof;
    return;
};

foreach my $row ($table->rows) {
    # student name: text of the link in column 1
    $parse_cell->($aparser, $row->[1]);
    my $name = $parsed_text;

    # student id: text of the <span> in column 2
    $parse_cell->($spanparser, $row->[2]);
    my $sid = $parsed_text;

    # student e-mail: target of the link in column 7, without its scheme
    $parse_cell->($aparser, $row->[7]);
    my $email = $href;
    $email =~ s/^\s*mailto://i;

    # NOTE(review): fields are written verbatim, without CSV quoting; a
    # value containing a comma will shift the columns of its line.
    print join(",", $name, $sid, $email), "\n";
}
|