# countrefs1 -
# count the number of references to a given word or phrase in
# a Balzac title.
#C:\Python20>python countrefs1.txt >countrefs1out.txt
#========================================================================
# What the program does in words:
# For each line in the reference file:
# A line contains a reference if it begins with a file reference,
# recognized/identified by the regex for a file path.
# If the line contains a reference the next step is to strip
# off the file name without the ".txt" suffix that gives
# the Project Gutenberg title identifier for the Balzac work (e.g. brcrc10).
# Next discard the reference if it is part of the character
# repertory or the introductions and not a story: (1rthc10, 2rthc10, hciaa10).
# Next, if the title already has a count associated with it
# increment the count, otherwise create a counter and set it to 1.
# (also note although I used the grep/search in EditPlus
# (Microsoft VC++ IDE would also work) it would be a good
# idea to rewrite this grep in an independent stand-alone
# utility program that could be integrated into other programs)
#=========================================================================
import re, string, sys
def loadtable_balzactitles():
    """Load the Balzac title table from ``balzactitles.txt``.

    The file is read from the current working directory.  Every usable
    line has the form ``<identifier>10.txt <title text>`` (optionally
    preceded by whitespace), for example ``brcrc10.txt The Red Inn``.

    Lines that do not have this form (blank lines, headings, ...) are
    skipped instead of aborting the whole load.

    Returns:
        dict mapping the identifier without the ``.txt`` suffix
        (e.g. ``"brcrc10"``) to the title text with surrounding
        whitespace removed.  If an identifier occurs more than once the
        last occurrence wins.

    Raises:
        IOError / OSError: if ``balzactitles.txt`` cannot be opened.
    """
    tablefilename = "balzactitles.txt"
    entry_pattern = re.compile(r'\s*(\w+10)\.txt(.+)')
    table = {}
    # "with" guarantees the file is closed even if reading fails part way.
    with open(tablefilename, "r") as infile:
        for line in infile:
            match = entry_pattern.match(line)
            if match is None:
                # Not a "<id>10.txt <title>" line; nothing to record.
                continue
            identifier, title_text = match.groups()
            table[identifier.strip()] = title_text.strip()
    return table
def descending(tuple1,tuple2):
    """Comparison function ordering ``(key, count)`` pairs by count, largest first.

    Only element ``[1]`` of each argument is compared.

    Returns:
        -1 if ``tuple1`` sorts before ``tuple2`` (it has the larger count),
        1 if it sorts after, and 0 when both counts are equal.
    """
    first, second = tuple1[1], tuple2[1]
    # Same result as cmp(second, first), spelled without the cmp() builtin,
    # which no longer exists in Python 3.
    return (second > first) - (second < first)
def delete_if_present(dict,key):
    """Remove ``key`` from the mapping ``dict`` in place, if it is there.

    A missing key is silently ignored.  Returns None.
    """
    # The "in" operator replaces dict.has_key(), which Python 3 removed.
    if key in dict:
        del dict[key]
def count_references(filename,name):
    """Count references per Balzac title in a search-result file and print a report.

    A line of ``filename`` is treated as a reference when it starts with
    ``<word>:`` (a crude test for a path such as ``C:...``).  The title
    identifier is the first ``xxxxx10.txt`` file name found in that line,
    without the ``.txt`` suffix.  Reference lines in which no such file
    name can be found are ignored.

    The identifiers 1rthc10, 2rthc10 and hciaa10 (character repertory and
    introductions) are excluded from the report and from the total.

    The report is written to standard output: ``name`` as a heading, then
    one line per title ordered by descending count, showing the count, its
    whole-number percentage of the total, the title text from
    ``balzactitles.txt`` and the identifier, and finally the total.

    NOTE(review): identifiers are counted with the case they have in the
    file, while the exclusions above and the title lookup use lower case;
    mixed-case input is therefore counted per spelling.

    Args:
        filename: path of the reference (search output) file.
        name: heading printed at the top of the report.

    Returns:
        None.

    Raises:
        IOError / OSError: if ``filename`` or ``balzactitles.txt`` cannot
        be opened.
    """
    titles = loadtable_balzactitles()
    re_path = re.compile(r'\w+:', re.IGNORECASE)  # change this to a full path recognizer
    re_title = re.compile(r'(\w\w\w\w\w10)\.txt', re.IGNORECASE)

    # count the references to each title
    refs = {}
    with open(filename, 'r') as infile:
        for line in infile:
            if not re_path.match(line):
                continue
            found = re_title.search(line)
            if found is None:
                # Looks like a path line but names no "xxxxx10.txt" file.
                continue
            title = found.group(1)
            refs[title] = refs.get(title, 0) + 1

    delete_if_present(refs, '1rthc10')  # not a title (character repertory: part 1)
    delete_if_present(refs, '2rthc10')  # not a title (character repertory: part 2)
    delete_if_present(refs, 'hciaa10')  # not a title (The Human Comedy: Introductions & Appendix)

    total = sum(refs.values())
    ranked = list(refs.items())
    # Stable sort, largest count first; ties keep their original order,
    # exactly as sorting with a descending comparison function would.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    # Each print() receives one pre-formatted string so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print("\n%s" % name)
    print("\n\nReferences Title")
    for identifier, count in ranked:
        lookup_key = identifier.lower()
        if lookup_key not in titles:
            print("[%s] not in titles table" % (identifier))
        else:
            title = titles[lookup_key]
            # Floor division keeps the percentage a whole number.
            percent = "%s" % ((count * 100) // total)
            print("%s (%s%%) %s (%s)" % (count, percent, title, identifier))
    print("\ntotal references:  %s" % total)
#----------------------------------------------------------------------
# main program:
if __name__ == "__main__":
    # Usage: countrefs1 referencefile.txt name
    # With any other number of arguments the built-in Bianchon example
    # is processed instead.
    if len(sys.argv) == 3:
        filename, name = sys.argv[1], sys.argv[2]
    else:
        filename, name = "bianchonrefs.txt", "Bianchon"
    count_references(filename, name)
# Text file Source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square
# geocities.com/soho
# (to report bad content: archivehelp @ gmail)