# wordcount1 -
# This is the backend program for wordcount0.py and wordcount1.py
# which reduce a Project Gutenberg file to a list of words in the
# file with a count for each word, this backend just merges the
# separate wordcount files into one master wordcount file for
# all the Project Gutenberg files in a directory (here the complete works of Balzac).
# (note: Why have two programs, wordcount0 and wordcount1, instead of one?
# The C++ program that counts the number of occurrences of a word in a file
# is portable and simple. If I were to add the logic that iterates over all
# the files in a directory for Windows98/DOS the program would not be portable
# anymore and since I'm planning to eventually move to Linux I'm trying to
# keep everything maximally portable and simple. Also Python proved too slow in parsing
# the initial wordcounts out of the Balzac story files (wordcount0), but very fast
# in summarizing the wordcount records that wordcount0 produced.)
# (note: the files that contain only summary material rather than
# a story written by Balzac: (shciaa10.txt,2rthc10.txt,2rthc10.txt) must be eliminated
# from the directory of count files before running this program.)
# run from DOS prompt:
# C:\Python20>python wordcount1.txt
# running it against the complete works of Balzac I got a total of
# 4,479,835 (key: 'aaatotal') words and 43,930 unique words which seems
# like a pretty good special purpose corpus...
# with a lot of well-focused rich semantic ore....now to mine the collocations
# of the word "nose" (count: 377) and delve into the "science" of phrenology that Balzac
# practiced in designing his characters....
#======================================================================================
import re, sys, string, glob, os.path, os
def make_wordcounts(count_dir):
    """Merge every per-file word count found in count_dir into one dictionary.

    Each file matching ``*.*`` in count_dir is read line by line.  A line of
    the form ``word:count`` adds count to the running tally for word.  Any
    other line (no colon, more than one colon, or a count that is not an
    integer) is skipped.  The name of each file is printed as it is read.

    count_dir -- directory holding the count files; a trailing path
                 separator is optional.

    Returns a dictionary mapping each word to its summed count, plus the
    extra key 'aaatotal' holding the sum of every count that was read.
    """
    # os.path.join copes with count_dir given with or without a trailing
    # separator; plain string concatenation only worked with one.
    fileset = glob.glob(os.path.join(count_dir, '*.*'))

    wc = {}
    total = 0
    for text in fileset:
        print(text)
        infile = open(text, 'r')
        # try/finally so the handle is released even if reading fails.
        try:
            for line in infile:
                fields = line.split(':')
                if len(fields) != 2:
                    continue
                try:
                    count = int(fields[1])
                except ValueError:
                    # Not a count record (e.g. "title:something"); skip it
                    # like any other line that is not word:count.
                    continue
                word = fields[0]
                wc[word] = wc.get(word, 0) + count
                total = total + count
        finally:
            infile.close()

    wc['aaatotal'] = total
    return wc
def make_master_wordcount(wc, master_file):
    """Write the merged counts to master_file, one ``word:count`` per line.

    wc          -- dictionary mapping each word to its count
    master_file -- path of the file to write (opened in 'w' mode, so any
                   existing content is replaced)

    Entries are written in sorted key order.  Afterwards the number of
    unique words is printed; the 'aaatotal' entry, if present, is written
    to the file but not counted as a word.
    """
    words = sorted(wc)

    outfile = open(master_file, 'w')
    # try/finally so the handle is released even if a write fails.
    try:
        for word in words:
            outfile.write("%s:%s\n" % (word, wc[word]))
    finally:
        outfile.close()

    unique_word_ctr = len(words)
    # 'aaatotal' holds the grand total of all counts, not a real word, so
    # leave it out of the unique-word figure.
    if 'aaatotal' in wc:
        unique_word_ctr = unique_word_ctr - 1
    print("total unique words:  %d" % unique_word_ctr)
#----------------------------------------------------------------------
# main program:
if __name__ == "__main__":
    # Usage: python wordcount1.py [count_dir [master_file]]
    #
    # Both arguments are optional; the defaults below are used for whichever
    # is missing.  The master file is always written, whether or not a
    # directory was given on the command line.
    count_dir = "c:\\Balzac\\balzacbooksnofront\\wordcounts\\"
    master_file = "c:\\python20\\balzacwords.txt"
    if len(sys.argv) > 1:
        count_dir = sys.argv[1]
    if len(sys.argv) > 2:
        master_file = sys.argv[2]

    wc = make_wordcounts(count_dir)
    make_master_wordcount(wc, master_file)
# Text file Source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square
# geocities.com/soho
# (to report bad content: archivehelp @ gmail)