# wordget3 - applies a stoplist which eliminates high-frequency (useless for query)
# words from the string before extracting the words.
use MakeRegex;
$regex = stoplist2regex("stop2.txt");
print "regex: $regex\n";
@text = ;
$text = join(" ",@text);
print "\n\ntext before: $text\n\n";
preprocess(\$text);
print "ALL WORDS:*******************************************\n";
($unique,$wdups,$count) = wordget(\$text);
print "unique: $unique, wdups: $wdups\n";
print "\nALL EXCEPT STOPLISTED WORDS**************************\n";
($unique,$wdups,$count) = wordget_stop(\$text,\$regex);
print "\n\nunique: $unique, wdups: $wdups\n";
# sort the hash by value, print out sorted hash
foreach $word (sort numerically keys %$count) {
print "$count->{$word} : $word\n";
}
exit;
sub preprocess {
my($text_ref) = @_;
$$text_ref =~ tr/A-Z/a-z/; # convert to lower case
$$text_ref =~ tr/.,:;!?"(){}'/ /d; # strip out punctuation
$$text_ref =~ s/--/ /g;
}
sub wordget_stop {
# extract all words except those on a stoplist from a string into a hash and count them
my($text_ref,$regex_ref) = @_;
my(%count,$word,$unique,$wdups,$regex);
$regex = $$regex_ref;
# get rid of the stoplisted words using the regex
$$text_ref =~ s/ / /g;
$$text_ref =~ s/\s$regex\s/ /gc;
print "text after: $$text_ref\n";
# split line into words
# count words, put count into hash (word is key, count is value)
foreach $word (split(" ",$$text_ref)) {
$count{$word}++;
}
$unique = scalar(keys(%count));
$wdups = 0;
foreach $word (keys %count) {
$wdups += $count{$word};
}
return ($unique,$wdups,\%count);
}
sub wordget {
# extract all words into a hash and count them
my($text_ref) = @_;
my(%count,$word,$unique,$wdups);
# split line into words
# count words, put count into hash (word is key, count is value)
foreach $word (split(" ",$$text_ref)) {
$count{$word}++;
}
$unique = scalar(keys(%count));
$wdups = 0;
foreach $word (keys %count) {
$wdups += $count{$word};
}
return ($unique,$wdups,\%count);
}
sub numerically {
# compare two words numerically in decreasing order
$count->{$b} <=> $count->{$a};
}
sub stoplist2regex {
my($stoplist) = @_;
my($regex,@list,$word);
open(STOP,$stoplist) or die "Can't open stoplist: $stoplist\n";
@list = ;
foreach $word (@list) {
#print "before:*$word*\n";
trimword($word);
#print "after:*$word*\n";
}
$regex = MakeRegex::make_regex(@list);
return $regex;
}
sub trimword {
#remove excess characters before or after a word
chomp($_[0]);
$_[0] =~ s/^\W+//;
$_[0] =~ s/\W+$//;
}
__DATA__
'Stop, stop!' said the old woman in a loud whisper. 'Will she be
buried to-morrow, or next day, or to-night? I laid her out; and
I must walk, you know. Send me a large cloak: a good warm one:
for it is bitter cold. We should have cake and wine, too, before
we go! Never mind; send some bread--only a loaf of bread and a
cup of water. Shall we have some bread, dear?' she said eagerly:
               (
geocities.com/soho/square)                   (
geocities.com/soho)