# recur2 -
# makes a list of all recurring characters in the Comedie Humaine
# and the works they appear in.

# accumulates the lists at the end of each Project Gutenberg Balzac work into
# a master list and prints it out in the same format as the Project Gutenberg lists 

# Some little problems encountered along the way and repaired:
# 1. Names and titles with a quotation mark in them (cannot be in a dictionary key).
# 2. Title line first rather than name line, so title gets mistaken for a name.
# 3. Inserting notes after the beginning of the character list.

#-------------------------------------------------------------------------------
import string, re, glob

class Characters:
    """Master list of recurring characters and the works they appear in.

    Every file matching ``dirname + '*.*'`` is scanned for its addendum: the
    section introduced by a line containing ``ADDENDUM_MARKER``.  After that
    line the parser expects blank-line-separated entries, each made of one
    name line followed by one or more title lines.

    The result is accumulated in ``self.names``, a dict mapping each
    character name to a dict whose keys are the titles seen for that
    character (the values are always 1).

    The code is written to run unchanged on Python 2.6+ and Python 3; the
    printed text is the same as the original print statements produced.
    """

    # Text identifying the line that introduces the character list.
    ADDENDUM_MARKER = "The following personage"

    # Titles known to have been mistaken for character names.  When one of
    # them is about to be stored as a *new* name, the offending file is
    # reported so it can be repaired by hand.
    SUSPECT_NAMES = (
        "A Bachelor's Establishment",
        "Letters of Two Brides",
        "Scenes from a Courtesan's Life",
        "The Imaginary Mistress",
    )

    def __init__(self, dirname):
        """Read every file under ``dirname`` and build the master list.

        ``dirname`` must end with a path separator because the glob pattern
        is appended to it directly.
        """
        self.dirname = dirname
        self.filename = ""
        self.names = {}
        self.line = ""
        self.name = ""
        # Parser state, defined up front so isname()/istitle() can never hit
        # a missing attribute.
        self.name_next = 0
        self.title_next = 0
        self.accumulate_dir(dirname)

    def more_lines(self):
        """Return True while the last readline() did not signal end of file."""
        return self.line != ""

    def start_addendum(self):
        """Return True if the current line contains the addendum marker."""
        return self.ADDENDUM_MARKER in self.line

    def goto_addendum(self, infile):
        """Advance ``infile`` to the line following the addendum marker.

        On return ``self.line`` holds the first line after the marker, or
        "" if the file has no addendum (a warning is printed in that case).
        Returns ``infile`` so existing callers that rebind it keep working.
        """
        self.line = infile.readline()
        while self.more_lines():
            # Check the line *before* reading the next one, so a marker on
            # the very first line of the file is not skipped.
            if self.start_addendum():
                print("Addendum exists")
                self.line = infile.readline()
                return infile
            self.line = infile.readline()
        print("WARNING: No Addendum:  %s" % (self.filename,))
        return infile

    def set_character(self):
        """Make the current line the current character, adding it if new."""
        self.name = self.line
        if self.name and self.name not in self.names:
            self.names[self.name] = {}
            if self.name in self.SUSPECT_NAMES:
                print("ERRORFILE:  %s" % (self.filename,))

    def add_title(self):
        """Record the current line as a title of the current character.

        Apostrophes are removed from the title wherever they occur,
        including at the very start of the line.
        """
        if self.name not in self.names:
            print("ERROR: current name not present:  %s" % (self.name,))
            return
        title = self.line.replace("'", "")
        self.names[self.name][title] = 1

    def isblankline(self):
        """Return True for an empty line; the next non-blank line is a name."""
        if self.line:
            return False
        self.name_next = 1
        self.title_next = 0
        return True

    def isname(self):
        """Return True if the current non-blank line is expected to be a name."""
        if not (self.name_next and self.line):
            return False
        self.name_next = 0
        self.title_next = 1
        return True

    def istitle(self):
        """Return True if the current non-blank line is expected to be a title."""
        if not (self.title_next and self.line):
            return False
        # Further titles may follow, so the state stays "title expected".
        self.name_next = 0
        self.title_next = 1
        return True

    def accumulate_dir(self, dirname):
        """Accumulate the character list of every file matching ``dirname*.*``."""
        # Sorted so files are processed (and logged) in a reproducible order.
        for path in sorted(glob.glob(dirname + '*.*')):
            self.accumulate_file(path)

    def accumulate_file(self, filename):
        """Merge the addendum of one file into ``self.names``."""
        self.filename = filename
        print("\nfilename:  %s" % (filename,))
        # Start each file from a clean state: nothing is accepted as a name
        # or title until the first blank line after the marker.  This keeps
        # a wrapped continuation of the marker sentence, or state left over
        # from the previous file, from being recorded as an entry.
        self.name_next = 0
        self.title_next = 0
        with open(filename, 'r') as infile:
            self.goto_addendum(infile)
            while self.more_lines():
                self.line = self.line.strip()
                # isblankline() is called for its side effect on the parser
                # state even though a blank line needs no further action.
                if self.isblankline():
                    pass
                elif self.isname():
                    self.set_character()
                elif self.istitle():
                    self.add_title()
                self.line = infile.readline()

    def print_histogram_titlesperchar(self, filename):
        """Write one ``<count>, "<name>",`` line per character to ``filename``.

        Lines are sorted by ascending title count, then by name.
        """
        histogram = sorted(
            (len(titles), name) for name, titles in self.names.items()
        )
        with open(filename, 'w') as outfile:
            for count, name in histogram:
                outfile.write('%s, "%s",\n' % (count, name))

    def _print_entry(self, name):
        """Print one character name followed by its indented titles."""
        print("\n%s" % (name,))
        for title in self.names[name]:
            print("   %s" % (title,))

    def printer_sorted(self):
        """Print every character and its titles, names in sorted order."""
        for name in sorted(self.names):
            self._print_entry(name)

    def printer(self):
        """Print a separator line, then every character in dict order."""
        print("===========================================================================")
        for name in self.names:
            self._print_entry(name)

class Character:
    """Plain record for one character: name parts, a comment and titles."""

    def __init__(self, lastname, firstname, comment, titles):
        # Store the four values as given; titles is noted in the original
        # as a list and is kept by reference, not copied.
        (self.lastname,
         self.firstname,
         self.comment,
         self.titles) = (lastname, firstname, comment, titles)

#---------------------------------------------------------------

# Directory scanned for the texts.  The trailing separator is required:
# Characters.accumulate_dir appends the '*.*' glob pattern directly to it.
dirname = "c:\\balzac\\balzacbooksnofront\\"
# Constructing Characters reads every matching file immediately, because its
# __init__ calls accumulate_dir.
chars = Characters(dirname)
# Print each character followed by its titles, names in sorted order.
chars.printer_sorted()

# Write one '<count>, "<name>",' line per character to titlesperchar.txt.
chars.print_histogram_titlesperchar("titlesperchar.txt")
# Text file source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)