# recur2 -
# makes a list of all recurring characters in the Comedie Humaine
# and the works they appear in.
# accumulates the lists at the end of each Project Gutenberg Balzac work into
# a master list and prints it out in the same format as the Project Gutenberg lists
# Some little problems encountered along the way and repaired:
# 1. Names and titles with a quotation mark in them (cannot be in a dictionary key).
# 2. Title line first rather than name line, so title gets mistaken for a name.
# 3. Inserting notes after the beginning of the character list.
#-------------------------------------------------------------------------------
import string, re, glob
class Characters:
    """Master list of recurring characters gathered from a directory of texts.

    Each file is scanned for the first line containing the text
    "The following personage"; everything after that line is read as a
    character list.  In that list a blank line ends the current entry, the
    first non-blank line of an entry is the character's name, and every
    further non-blank line of the entry is a title the character appears in.

    The result is kept in ``self.names``: a dict mapping each name line to a
    dict whose keys are the titles collected for that name (the values are
    always 1).  Constructing an instance scans ``dirname`` immediately.
    """

    # Work titles that are reported with an ERRORFILE message when one of
    # them is read in the position of a character name.
    _TITLES_MISTAKEN_FOR_NAMES = (
        "A Bachelor's Establishment",
        "Letters of Two Brides",
        "Scenes from a Courtesan's Life",
        "The Imaginary Mistress",
    )

    def __init__(self, dirname):
        """Set up empty state and accumulate every file found in dirname."""
        self.dirname = dirname
        self.filename = ""
        self.names = {}
        self.line = ""
        self.name = ""
        # Parser state: the start of a list is treated like the line after a
        # blank line, i.e. the next non-blank line is a character name.
        self.name_next = 1
        self.title_next = 0
        self.accumulate_dir(dirname)

    def more_lines(self):
        """Return True while the last readline() did not hit end of file."""
        return self.line != ""

    def start_addendum(self):
        """Return True if the current line holds the character-list header."""
        return "The following personage" in self.line

    def goto_addendum(self, infile):
        """Advance infile to the line that follows the character-list header.

        On success self.line holds the first line after the header.  If the
        header is never found a warning is printed and self.line is "" (end
        of file).  The file object is returned in both cases.
        """
        self.line = infile.readline()
        while self.more_lines():
            # Test the line before reading the next one, so that a header on
            # the very first line of the file is recognised as well.
            if self.start_addendum():
                print("Addendum exists")
                self.line = infile.readline()
                return infile
            self.line = infile.readline()
        print("WARNING: No Addendum:  %s" % self.filename)
        return infile

    def set_character(self):
        """Make the current line the current character, adding it if new."""
        self.name = self.line
        if self.name and self.name not in self.names:
            self.names[self.name] = {}
        if self.name in self._TITLES_MISTAKEN_FOR_NAMES:
            print("ERRORFILE:  %s" % self.filename)

    def add_title(self):
        """Record the current line as a title of the current character.

        Apostrophes are removed from the title before it is stored.
        """
        # Always strip: the result of find() is not a usable truth value
        # (0 means "found at index 0", -1 means "not found").
        title = self.line.replace("'", "")
        if self.name in self.names:
            self.names[self.name][title] = 1
        else:
            print("ERROR: current name not present:  %s" % self.name)

    def isblankline(self):
        """Return True for an empty line and expect a name next."""
        if self.line:
            return False
        self.name_next = 1
        self.title_next = 0
        return True

    def isname(self):
        """Return True if the current non-empty line is a character name."""
        if self.name_next and self.line:
            self.name_next = 0
            self.title_next = 1
            return True
        return False

    def istitle(self):
        """Return True if the current non-empty line is a title."""
        if self.title_next and self.line:
            # Further titles may follow until the next blank line.
            self.name_next = 0
            self.title_next = 1
            return True
        return False

    def accumulate_dir(self, dirname):
        """Accumulate the character list of every '*.*' file in dirname.

        dirname may be given with or without a trailing path separator.
        """
        import os  # local import: os is not among the file-level imports

        pattern = os.path.join(dirname, '*.*')
        # Sorted so that files are processed in a reproducible order.
        for path in sorted(glob.glob(pattern)):
            if os.path.isfile(path):  # a directory can match '*.*' too
                self.accumulate_file(path)

    def accumulate_file(self, filename):
        """Merge the character list found in one file into self.names."""
        self.filename = filename
        print("\nfilename:  %s" % filename)
        # Start every file from a clean parser state so nothing leaks over
        # from the end of the previous file.
        self.name_next = 1
        self.title_next = 0
        with open(filename, 'r') as infile:
            self.goto_addendum(infile)
            while self.more_lines():
                self.line = self.line.strip()
                if not self.isblankline():
                    if self.isname():
                        self.set_character()
                    elif self.istitle():
                        self.add_title()
                self.line = infile.readline()

    def print_histogram_titlesperchar(self, filename):
        """Write one 'count, "name",' line per character to filename.

        Lines are sorted by ascending title count, then by name.
        """
        histogram = sorted(
            (len(titles), name) for name, titles in self.names.items()
        )
        with open(filename, 'w') as outfile:
            for count, name in histogram:
                outfile.write("%s, \"%s\",\n" % (count, name))

    def printer_sorted(self):
        """Print every character, sorted by name, followed by its titles."""
        for name in sorted(self.names):
            print("\n%s" % name)
            for title in self.names[name]:
                print(" %s" % title)

    def printer(self):
        """Print every character and its titles in dictionary order."""
        print("===========================================================================")
        for name in self.names:
            print("\n%s" % name)
            for title in self.names[name]:
                print(" %s" % title)
class Character:
    """Plain record for one character: name parts, a comment and titles."""

    def __init__(self, lastname, firstname, comment, titles):
        """Store the four given values unchanged on the instance."""
        self.lastname = lastname
        self.firstname = firstname
        self.comment = comment
        self.titles = titles  # list
#---------------------------------------------------------------
# Driver: runs whenever this file is executed.
# Directory holding the texts to scan (Windows path, trailing separator
# included).
dirname = "c:\\balzac\\balzacbooksnofront\\"
# Constructing Characters scans the directory right away.
chars = Characters(dirname)
# Print the merged list ordered by character name.
chars.printer_sorted()
# Write the titles-per-character histogram; the relative file name resolves
# against the current working directory.
chars.print_histogram_titlesperchar("titlesperchar.txt")
# Text file source (historic): geocities.com/soho/square/3472
# Parent pages: geocities.com/soho/square, geocities.com/soho
# (to report bad content: archivehelp @ gmail)