# wordsinpara11 - iterates over all Balzac writings included in the Gutenberg Project
# and lists paragraphs that have a lot of words used to describe the physical 
# appearance of characters, either their body or their clothes.
#
# This supplants wordsinpara9, this version has a more linear design,
# it builds the regex in a separate routine prior to matching, then collects a list
# of files to search in, then iterating over these files, it opens each file and 
# reads it into a string in a separate routine, and then passes a reference to 
# this stringto the actual search routine. 

# this version changes the regex search loop in extract_paragraphs to make
# it simpler so that it can also be used to extract all paragraphs whether
# they have keywords or not. The loop creates one record per paragraph
# containing the within-file begin and end offsets for each paragraph along
# with a list of all keywords found in the paragraph. 
# The decision on whether to extract the paragraph or not is deferred to the 
# next block, in which all paragraphs with keyword counts within the chosen
# range are extracted.

use MakeRegex;

use Data::Dumper;
$Data::Dumper::Indent = 0;  

#print "\n\nBalzacs Interiors\n\n";
print "\n\nDickens Food\n\n";

$upperlimit = 10000;
$lowerlimit = 4;
print "$lowerlimit < Keywords-In-Text < $upperlimit\n\n";

#$dir = "c:\\Balzac\\";
$dir = "c:\\dickens\\";
#print "directory: $dir\n\n";

#$titles_file = "balzactitles.tit";
$titles_file = "dickenstitles.tit";
$titles = init_titles($titles_file,$dir);
#print "titles: ",Dumper($titles),"\n";

#*******************************************
# keyword stuff
#$keywords_file = "interiors.wrd";
$keywords_file = "foodall.wrd";
$regex = build_regex($keywords_file,$dir);

#$regex_actual = '\n\s*\n';   # get all paragraphs
#$regex = \$regex_actual;

print "regex: $$regex\n";
#*************************************************


$bytesread_tot = 0;

# make a list of files to search
@tosearch = ();
opendir(DIR,$dir) or die "Can't open $dir: $! \n";
@files = readdir(DIR);
foreach $file (@files) {
   if (-f "$dir\\$file") {
      $tosearch = $dir . $file;
      push(@tosearch,$tosearch);
   }
}
closedir(DIR);
print "tosearch: ",Dumper(\@tosearch),"\n";

# search and collect info
foreach $file (@tosearch) {
   $bigstring = file2string($file);
   $paragraphs = extract_paragraphs($regex,$bigstring,$upperlimit,$lowerlimit,$titles);
   foreach $paragraph (@$paragraphs) {
      #print Dumper($paragraph), "\n\n"; 
      print_paragraph($paragraph,$titles);
   }
}
exit;

#**********************************************
# subroutines begin here

sub file2string {
   my($text_file) = @_;
   my($bigstring,$bytesread);

   open(INFILE,$text_file) or die "Can't open file $text_file\n";  
   $bigstring = "";    
   $bytesread = read(INFILE,$bigstring,3000000);
   print "bytes read: $bytesread\n";
   $bytesread_tot += $bytesread;
   return \$bigstring;
}


sub extract_paragraphs {
   my($regex_ref,$bigstring_ref,$upperlimit,$lowerlimit,$titles) = @_;
   my($infile,$bytesread,$toregex,$regex,@conc);
   my($cur,$prev,$pos_prev,$pos_cur,@akeyword,@keywords,$item);
   my($para,$offset,$len,$para_text,@record,$wordcount,$i);
   my($record);

   # iterate over paras in string/file 
   local($bigstring) = $$bigstring_ref;
   $regex = $$regex_ref;
   @conc = ();
   $prev = '';
   $pos_prev = 0;
   $bigstring =~ /($regex)/gi;
   $cur = $1;
   @para = ();
   @words = ();
   $pos_prev = 0;
   while ($pos_cur = pos($bigstring)) {
      trimword($cur);

      if (isparagraph($cur)) {
         push(@conc,[$pos_prev,$pos_cur,@words]);
         $pos_prev = $pos_cur;
         @para  = (); 
         @words = ();
      } elsif (isword($cur)) { 
         push(@words,$cur);
      }

      $bigstring =~ /($regex)/gi;
      $cur = $1;
   }

   #print "conc: ",Dumper(\@conc),"\n";  

   # if para has number of keywords in range, record it.
   @keywords = ();
   foreach $para (@conc) {
      my($wordcount) = scalar(@$para) - 2;
      if (($wordcount >= $lowerlimit) &&
         ($wordcount <= $upperlimit)) {
            $record = record_para($para);
            push(@keywords,[@$record]);
      }
   }
   return \@keywords;
}

sub record_para {
   my($para) = @_;
   my($offset,$len,@record,$para_text,$wordcount);

   $offset    = $para->[0]; 
   $len       = $para->[1] - $para->[0];
   $wordcount = scalar(@$para) - 2;
   $para_text = substr($bigstring,$offset,$len);
   @record = ($wordcount,$text_file,$offset,$len);
   for ($i = 2; $i < $wordcount + 2; $i++) {
      push(@record,$para->[$i]);
   }
   push(@record,$para_text);
   return \@record;
}

sub stringfile2regex {
   # turn strings in a text file into a regular expression
   my($infile) = @_;
   #print "infile: $infile\n";
   open(INFILE,$infile) or die "Can't open $infile";
   my(@list) = ();
   while($line = ) {
      chlomp($line);
      push(@list,$line);
   }
   #print "\nkeywords: ",Dumper(\@list),"\n";
   my($regex) = MakeRegex::make_regex(@list);
   #print "\nkeywords-regex: $regex\n\n";
   return $regex;
}

sub isnumber {
   return($_[0] =~ /^\d+$/);
}

sub selectit {
   my($cur,$after,$conc) = @_; 
   my($topush) = [$cur,$after];
   push(@$conc,$topush);
}

sub trimword {
   #remove excess characters before or after a word 
   chomp($_[0]);
   $_[0] =~ s/^\W+//;
   $_[0] =~ s/\W+$//;
}

sub isword {
   if (length($_[0]) > 1) { return 1; } else { return 0; }; 
}

sub isparagraph {
   if (length($_[0]) <= 1) { return 1; } else { return 0; }; 
} 

sub chlomp {
   #remove excess white space and the beginning and end of strings,
   # and newlines at the end 
   chomp($_[0]);
   $_[0] =~ s/^\s+//;
   $_[0] =~ s/\s+$//;
}

sub init_titles {
   my($filename,$dir) = @_;
   my($path) = $dir . $filename;
   my($line,%file2title,$file,$title);
   open(INFILE,$path) or die "Can't open $path";
   while ($line = ) {
      #print "$line\n";
      $line =~ /^\s*(\w+.txt)\s*(.+?)\[.*$/;
      $file  = $1;
      $title = $2;
      chlomp($title);
      #print "file: $file title: $title \n";
      $file2title{$file} = $title;
   }
   return \%file2title;
}


sub build_regex {
   my($keywords_file,$dir) = @_;
   my($toregex,$regex);

   # build regex out of strings in string file
   $toregex = $dir . $keywords_file;
   $regex = stringfile2regex($toregex);

   # add to regex to handle the space or punctuation preceding and following words 
   # and to handle blank lines that indicate para breaks
   $regex = '\W+' . $regex . '\W+|\n\s*\n';
   return \$regex;
}

sub print_paragraph {
   my($para,$titles) = @_;
   my($i,$text_pos);
   my($filename) = $para->[1];
   chlomp($filename);
   $filename = lc($filename);
   print "From: $titles->{$filename}\n";
   print "(keyword count: $para->[0], file: $para->[1], offset: $para->[2], length: $para->[3])\n";
   $text_pos = scalar(@$para) - 1;
   print "[keywords: ";
   for ($i = 4; $i < $text_pos; $i++ ) {
      print "$para->[$i],";
   }
   print "]\n\n";
   print "$para->[$text_pos]\n\n";
}

    Source: geocities.com/soho/square/3472

               ( geocities.com/soho/square)                   ( geocities.com/soho)