package LexinSAXHandler;

# SAX handler that collects the text of <form> and <pronunciation> elements
# and, at the end of every <lemma-entry>, prints one line per
# (form, pronunciation) combination to the currently selected output handle.

use strict;
use warnings;

use base qw(XML::SAX::Base);

# Parser state. These are file-scoped lexicals, so they are shared by every
# instance of this handler; start_document() resets them for each document.
my $in_form = 0;     # true while inside a <form> element
my $in_pron = 0;     # true while inside a <pronunciation> element
my @forms   = ();    # raw text of each <form> seen in the current entry
my @prons   = ();    # raw text of each <pronunciation> in the current entry

# Strip leading and trailing whitespace from a string and return the result.
sub _trim {
    my ($text) = @_;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

# Return a cleaned copy of a raw form string.
sub _clean_form {
    my ($form) = @_;

    # NOTE: discards alternative forms
    # such as 'alltsamman(s)'
    $form =~ s/\([^)]*\)//;

    # Remove a single-digit number after the form. Trailing whitespace is
    # tolerated so the number is removed even when the element text ends in
    # spaces or newlines.
    $form =~ s/\s+\d\s*\z//;

    return _trim($form);
}

# Return a cleaned copy of a raw pronunciation string.
sub _clean_pron {
    my ($pron) = @_;

    # NOTE: discards alternative pronunciations
    # such as 'altihO:p(a)' and 'agresI:v (el. Ag:-)'
    $pron =~ s/\([^)]*\)//;

    return _trim($pron);
}

# Document start event: reset all collected state so a handler that is
# reused for several documents never carries data over from an earlier one.
sub start_document {
    my ($self, $doc) = @_;

    $in_form = 0;
    $in_pron = 0;
    @forms   = ();
    @prons   = ();

    return;
}

# Element start event: open a new, empty accumulator for <form> and
# <pronunciation> elements. $el is the SAX element hash; only its "Name"
# key is read.
sub start_element {
    my ($self, $el) = @_;

    if ($el->{"Name"} eq "form") {
        $in_form = 1;
        push @forms, "";
    }
    elsif ($el->{"Name"} eq "pronunciation") {
        $in_pron = 1;
        push @prons, "";
    }

    return;
}

# Character data event: append $el->{"Data"} to the accumulator of the
# element currently open. Text may arrive in several chunks, hence the
# concatenation onto the last slot.
sub characters {
    my ($self, $el) = @_;

    if ($in_form) {
        # Guard against an emptied list (a <lemma-entry> that closed while
        # a <form> was still open) so the append always has a slot.
        push @forms, "" if !@forms;
        $forms[-1] .= $el->{"Data"};
    }
    elsif ($in_pron) {
        push @prons, "" if !@prons;
        $prons[-1] .= $el->{"Data"};
    }

    return;
}

# Element end event. Closing a <lemma-entry> prints every combination of
# cleaned form and cleaned pronunciation, then clears both lists. Closing a
# <form> or <pronunciation> stops text collection for it.
sub end_element {
    my ($self, $el) = @_;

    if ($el->{"Name"} eq "lemma-entry") {
        # Clean each value exactly once, on copies. Cleaning inside the
        # nested loop would re-run the substitutions on the same string for
        # every form and give different forms different pronunciations.
        my @clean_forms = map { _clean_form($_) } @forms;
        my @clean_prons = map { _clean_pron($_) } @prons;

        for my $form (@clean_forms) {
            for my $pron (@clean_prons) {
                printf("%-35s %s\n", $form, $pron);
            }
        }

        @forms = ();
        @prons = ();
    }
    elsif ($el->{"Name"} eq "form") {
        $in_form = 0;
    }
    elsif ($el->{"Name"} eq "pronunciation") {
        $in_pron = 0;
    }

    return;
}

1;