# wordsperfile2 -
# attempting to improve performance
# To rank Balzac's works according to size, count the number of words in
# each Project Gutenberg file containing one of his works, create a list
# of tuples (title,word_count) and sort the list on word_count.
# C:\Python20>python wordsperfile2.txt >wordsperfile2out.txt
#=================================================================
import re, sys, string, glob, os.path
def loadtable_balzactitles(tablefilename="balzactitles.txt"):
    """Load the lookup table that maps a file prefix to a work's title.

    Every usable line of the table file has the shape
    ``<prefix>10.txt <title>`` (leading whitespace allowed), for example
    ``  abcde10.txt   Some Title``.  Lines that do not have this shape
    (blank lines, headings, ...) are skipped.

    Args:
        tablefilename: Path of the table file.  Defaults to
            "balzactitles.txt" in the current working directory.

    Returns:
        A dict whose keys are the prefixes (including the trailing "10",
        without ".txt") and whose values are the titles, both stripped of
        surrounding whitespace.  If a prefix occurs more than once, the
        last occurrence wins.
    """
    entry_re = re.compile(r'\s*(\w+10)\.txt(.+)')
    table = {}
    # The with-statement closes the file even if reading fails part-way.
    with open(tablefilename, "r") as infile:
        for line in infile:
            m = entry_re.match(line)
            if m is None:
                # Not a table entry; calling m.groups() here would raise
                # AttributeError, so just move on to the next line.
                continue
            prefix, title = m.groups()
            table[prefix.strip()] = title.strip()
    return table
def length_classify(wordcount):
    """Return a size label for a work with the given number of words.

    The bands are checked from largest to smallest and each lower bound is
    exclusive: a count must be strictly greater than the bound to earn the
    label.  Anything at or below 12000 words is 'very short'.
    """
    bands = (
        (80000, 'very long'),
        (50000, 'long'),
        (26000, 'medium'),
        (12000, 'short'),
    )
    for lower_bound, label in bands:
        if wordcount > lower_bound:
            return label
    return 'very short'
def descending(tuple1, tuple2):
    """Comparison function that orders records by word count, largest first.

    Both arguments are records whose third element (index 2) is the word
    count.  Returns a negative number when tuple1 should sort before tuple2
    (tuple1 has the larger count), a positive number when it should sort
    after, and 0 when the counts are equal.

    The result is computed without the ``cmp`` builtin so the function
    also works on Python 3, where ``cmp`` no longer exists.
    """
    left, right = tuple2[2], tuple1[2]
    # Equivalent to cmp(left, right): booleans subtract to -1, 0 or 1.
    return (left > right) - (left < right)
def countwords(file):
    """Count the words in the text file at path ``file``.

    A word is a run of word characters, optionally containing one internal
    apostrophe ("It's") or hyphen ("well-known"), that is immediately
    followed by whitespace, one of ``, . ! ? ;`` or the end of the text.
    A run that is directly followed by any other character (a colon or a
    closing quote, for instance) is not counted; this is a known
    limitation of the tokenizer.

    Args:
        file: Path of the file to read.

    Returns:
        The total number of words found, as an int (0 for an empty file).
    """
    # The trailing \Z alternative lets the very last word be counted when
    # the file does not end with a newline; without it that word is lost.
    word_re = re.compile(r"(\w+'\w+|\w+-\w+|\w+)(?:[\s,.!?;]|\Z)")
    wordcount = 0
    # Iterate line by line instead of loading the whole file, and let the
    # with-statement close the handle even if reading fails.
    with open(file, 'r') as infile:
        for line in infile:
            wordcount += len(word_re.findall(line))
    return wordcount
def wordsperfile_indirectory(dir):
    """Count the words of every ``*.txt`` work found in a directory.

    Every file in ``dir`` whose name contains a dot is considered.  The
    part of the file name in front of ".txt" is the file prefix; it is
    looked up (lower-cased) in the table returned by
    loadtable_balzactitles() to find the work's title.  Files without a
    recognisable prefix are reported and skipped; files whose prefix is
    not in the table are reported and recorded with an empty title.

    Args:
        dir: Directory to scan.  A trailing path separator is optional.

    Returns:
        A list of ``(title, file_prefix, wordcount, length_class)`` tuples
        sorted by wordcount, largest first.  Every record has all four
        fields, so callers can always index ``[3]``.
    """
    titles = loadtable_balzactitles()
    prefix_re = re.compile(r"\s*(\w+?)\.txt", re.IGNORECASE)

    # os.path.join adds the separator only when it is missing, so both
    # "books" and "books/" work.  Sorting the paths makes the order of
    # works with equal word counts reproducible (the final sort is stable).
    fileset = sorted(glob.glob(os.path.join(dir, '*.*')))

    wordcounts = []
    for path in fileset:
        filename = os.path.split(path)[1]
        m = prefix_re.search(filename)
        if not m:
            print("ERROR: Can't get prefix from filename:  %s" % filename)
            continue

        file_prefix = m.group(1)
        # Only read files that will actually be recorded.
        wordcount = countwords(path)

        key = file_prefix.lower()
        if key in titles:
            title = titles[key]
        else:
            print("[%s] not in titles table" % (file_prefix))
            title = ""
        wordcounts.append(
            (title, file_prefix, wordcount, length_classify(wordcount)))

    # key/reverse works on Python 2.4+ and 3; passing a cmp function
    # positionally to list.sort() is not accepted by Python 3.
    wordcounts.sort(key=lambda record: record[2], reverse=True)
    return wordcounts
#----------------------------------------------------------------------
# main program:
def main():
    """Print a report of the works in a directory, ranked by word count.

    The directory is taken from the single command-line argument when
    exactly one is given; otherwise a built-in default directory is used.
    The report is printed in both cases.
    """
    if len(sys.argv) == 2:
        directory = sys.argv[1]
    else:
        directory = "c:\\Balzac\\balzacbooksnofront\\"

    print("----------Count words in files-----")
    wordcounts = wordsperfile_indirectory(directory)
    print("Title Word Count")
    for record in wordcounts:
        title, prefix, count = record[0], record[1], record[2]
        # Tolerate three-field records (no size label) by classifying here.
        if len(record) > 3:
            label = record[3]
        else:
            label = length_classify(count)
        print("%s (%s): %s words (%s)" % (title, prefix, count, label))
    print("%s works have been counted" % (len(wordcounts)))


if __name__ == "__main__":
    main()
# Text file source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square  geocities.com/soho
# (to report bad content: archivehelp @ gmail)