# wordsinpara11 - iterates over all Balzac writings included in the Gutenberg Project
# and lists paragraphs that have a lot of words used to describe the physical
# appearance of characters, either their body or their clothes.
#
# This supplants wordsinpara9, this version has a more linear design,
# it builds the regex in a separate routine prior to matching, then collects a list
# of files to search in, then iterating over these files, it opens each file and
# reads it into a string in a separate routine, and then passes a reference to
# this stringto the actual search routine.
# this version changes the regex search loop in extract_paragraphs to make
# it simpler so that it can also be used to extract all paragraphs whether
# they have keywords or not. The loop creates one record per paragraph
# containing the within-file begin and end offsets for each paragraph along
# with a list of all keywords found in the paragraph.
# The decision on whether to extract the paragraph or not is deferred to the
# next block, in which all paragraphs with keyword counts within the chosen
# range are extracted.
use MakeRegex;
use Data::Dumper;
$Data::Dumper::Indent = 0;
#print "\n\nBalzacs Interiors\n\n";
print "\n\nDickens Food\n\n";
$upperlimit = 10000;
$lowerlimit = 4;
print "$lowerlimit < Keywords-In-Text < $upperlimit\n\n";
#$dir = "c:\\Balzac\\";
$dir = "c:\\dickens\\";
#print "directory: $dir\n\n";
#$titles_file = "balzactitles.tit";
$titles_file = "dickenstitles.tit";
$titles = init_titles($titles_file,$dir);
#print "titles: ",Dumper($titles),"\n";
#*******************************************
# keyword stuff
#$keywords_file = "interiors.wrd";
$keywords_file = "foodall.wrd";
$regex = build_regex($keywords_file,$dir);
#$regex_actual = '\n\s*\n'; # get all paragraphs
#$regex = \$regex_actual;
print "regex: $$regex\n";
#*************************************************
$bytesread_tot = 0;
# make a list of files to search
@tosearch = ();
opendir(DIR,$dir) or die "Can't open $dir: $! \n";
@files = readdir(DIR);
foreach $file (@files) {
if (-f "$dir\\$file") {
$tosearch = $dir . $file;
push(@tosearch,$tosearch);
}
}
closedir(DIR);
print "tosearch: ",Dumper(\@tosearch),"\n";
# search and collect info
foreach $file (@tosearch) {
$bigstring = file2string($file);
$paragraphs = extract_paragraphs($regex,$bigstring,$upperlimit,$lowerlimit,$titles);
foreach $paragraph (@$paragraphs) {
#print Dumper($paragraph), "\n\n";
print_paragraph($paragraph,$titles);
}
}
exit;
#**********************************************
# subroutines begin here
sub file2string {
my($text_file) = @_;
my($bigstring,$bytesread);
open(INFILE,$text_file) or die "Can't open file $text_file\n";
$bigstring = "";
$bytesread = read(INFILE,$bigstring,3000000);
print "bytes read: $bytesread\n";
$bytesread_tot += $bytesread;
return \$bigstring;
}
sub extract_paragraphs {
my($regex_ref,$bigstring_ref,$upperlimit,$lowerlimit,$titles) = @_;
my($infile,$bytesread,$toregex,$regex,@conc);
my($cur,$prev,$pos_prev,$pos_cur,@akeyword,@keywords,$item);
my($para,$offset,$len,$para_text,@record,$wordcount,$i);
my($record);
# iterate over paras in string/file
local($bigstring) = $$bigstring_ref;
$regex = $$regex_ref;
@conc = ();
$prev = '';
$pos_prev = 0;
$bigstring =~ /($regex)/gi;
$cur = $1;
@para = ();
@words = ();
$pos_prev = 0;
while ($pos_cur = pos($bigstring)) {
trimword($cur);
if (isparagraph($cur)) {
push(@conc,[$pos_prev,$pos_cur,@words]);
$pos_prev = $pos_cur;
@para = ();
@words = ();
} elsif (isword($cur)) {
push(@words,$cur);
}
$bigstring =~ /($regex)/gi;
$cur = $1;
}
#print "conc: ",Dumper(\@conc),"\n";
# if para has number of keywords in range, record it.
@keywords = ();
foreach $para (@conc) {
my($wordcount) = scalar(@$para) - 2;
if (($wordcount >= $lowerlimit) &&
($wordcount <= $upperlimit)) {
$record = record_para($para);
push(@keywords,[@$record]);
}
}
return \@keywords;
}
sub record_para {
my($para) = @_;
my($offset,$len,@record,$para_text,$wordcount);
$offset = $para->[0];
$len = $para->[1] - $para->[0];
$wordcount = scalar(@$para) - 2;
$para_text = substr($bigstring,$offset,$len);
@record = ($wordcount,$text_file,$offset,$len);
for ($i = 2; $i < $wordcount + 2; $i++) {
push(@record,$para->[$i]);
}
push(@record,$para_text);
return \@record;
}
sub stringfile2regex {
# turn strings in a text file into a regular expression
my($infile) = @_;
#print "infile: $infile\n";
open(INFILE,$infile) or die "Can't open $infile";
my(@list) = ();
while($line = ) {
chlomp($line);
push(@list,$line);
}
#print "\nkeywords: ",Dumper(\@list),"\n";
my($regex) = MakeRegex::make_regex(@list);
#print "\nkeywords-regex: $regex\n\n";
return $regex;
}
sub isnumber {
return($_[0] =~ /^\d+$/);
}
sub selectit {
my($cur,$after,$conc) = @_;
my($topush) = [$cur,$after];
push(@$conc,$topush);
}
sub trimword {
#remove excess characters before or after a word
chomp($_[0]);
$_[0] =~ s/^\W+//;
$_[0] =~ s/\W+$//;
}
sub isword {
if (length($_[0]) > 1) { return 1; } else { return 0; };
}
sub isparagraph {
if (length($_[0]) <= 1) { return 1; } else { return 0; };
}
sub chlomp {
#remove excess white space and the beginning and end of strings,
# and newlines at the end
chomp($_[0]);
$_[0] =~ s/^\s+//;
$_[0] =~ s/\s+$//;
}
sub init_titles {
my($filename,$dir) = @_;
my($path) = $dir . $filename;
my($line,%file2title,$file,$title);
open(INFILE,$path) or die "Can't open $path";
while ($line = ) {
#print "$line\n";
$line =~ /^\s*(\w+.txt)\s*(.+?)\[.*$/;
$file = $1;
$title = $2;
chlomp($title);
#print "file: $file title: $title \n";
$file2title{$file} = $title;
}
return \%file2title;
}
sub build_regex {
my($keywords_file,$dir) = @_;
my($toregex,$regex);
# build regex out of strings in string file
$toregex = $dir . $keywords_file;
$regex = stringfile2regex($toregex);
# add to regex to handle the space or punctuation preceding and following words
# and to handle blank lines that indicate para breaks
$regex = '\W+' . $regex . '\W+|\n\s*\n';
return \$regex;
}
sub print_paragraph {
my($para,$titles) = @_;
my($i,$text_pos);
my($filename) = $para->[1];
chlomp($filename);
$filename = lc($filename);
print "From: $titles->{$filename}\n";
print "(keyword count: $para->[0], file: $para->[1], offset: $para->[2], length: $para->[3])\n";
$text_pos = scalar(@$para) - 1;
print "[keywords: ";
for ($i = 4; $i < $text_pos; $i++ ) {
print "$para->[$i],";
}
print "]\n\n";
print "$para->[$text_pos]\n\n";
}
               (
geocities.com/soho/square)                   (
geocities.com/soho)