# wordcount1 - 

# This is the backend program for wordcount0.py and wordcount1.py
# which reduce a Project Gutenberg file to a list of words in the 
# file with a count for each word, this backend just merges the 
# separate wordcount files into one master wordcount file for 
# all the Project Gutenberg files in a directory (here the complete works of Balzac). 
 
# (note: Why have two programs, wordcount0 and wordcount1, instead of one?
# The C++ program that counts the number of occurrences of a word in a file
# is portable and simple. If I were to add the logic that iterates over all
# the files in a directory for Windows98/DOS, the program would not be portable
# anymore, and since I'm planning to eventually move to Linux I'm trying to
# keep everything maximally portable and simple. Also, Python proved too slow in parsing
# the initial wordcounts out of the Balzac story files (wordcount0), but very fast
# in summarizing the wordcount records that wordcount0 produced.)
# (note: the files that contain summary material rather than
# a story written by Balzac (shciaa10.txt, 2rthc10.txt, 2rthc10.txt) must be eliminated 
# from the directory of count files before running this program.)

# run from DOS prompt:
# C:\Python20>python wordcount1.txt

# running it against the complete works of Balzac I got a  total of 
# 4,479,835 (key: 'aaatotal') words and 43,930 unique words which seems 
# like a pretty good special purpose corpus...
# with a lot of well-focused rich semantic ore....now to mine the collocations 
# of the word "nose" (count: 377) and delve into the "science" of phrenology that Balzac 
# practiced in designing his characters.... 
#======================================================================================

import re, sys, string, glob, os.path, os

def make_wordcounts(count_dir):
    """Merge every per-file word-count file in count_dir into one dictionary.

    Every file matching ``*.*`` in count_dir is read line by line.  A line
    that splits on ':' into exactly two fields is treated as ``word:count``;
    any other line is ignored.  The name of each file is printed as it is
    processed.

    Args:
        count_dir: Directory holding the count files.  A trailing path
            separator is accepted but no longer required.

    Returns:
        A dict mapping each word to the sum of its counts over all files,
        plus the extra key 'aaatotal' holding the sum of every count read.

    Raises:
        ValueError: If the count field of a two-field line is not an integer.
    """
    # os.path.join only inserts a separator when count_dir lacks one, so
    # both "dir" and "dir/" select the files inside the directory.
    fileset = glob.glob(os.path.join(count_dir, '*.*'))
    # iterate over each file, accumulate wordcounts into a dictionary
    wc = {}
    total = 0
    for text in fileset:
        print(text)
        infile = open(text, 'r')
        try:
            for line in infile:
                w = line.split(':')
                if len(w) == 2:
                    word = w[0]
                    # int() tolerates the trailing newline left on the field
                    count = int(w[1])
                    total = total + count
                    wc[word] = wc.get(word, 0) + count
        finally:
            # close the file even when a malformed count raises ValueError
            infile.close()
    # grand total is stored under a key that sorts near the top of the output
    wc['aaatotal'] = total
    return wc

def make_master_wordcount(wc,master_file):
    """Write the merged word counts to master_file, one ``word:count`` per line.

    Lines are written in sorted key order.  After writing, the number of
    keys in wc is printed as the unique-word total.  Note that this figure
    counts every key, so it includes the 'aaatotal' entry when wc carries one.

    Args:
        wc: Dict mapping words to counts.
        master_file: Path of the output file; it is created or overwritten.

    Returns:
        None.
    """
    # list() keeps the in-place sort working where keys() returns a view
    wckeys = list(wc.keys())
    wckeys.sort()
    outfile = open(master_file, 'w')
    try:
        for word in wckeys:
            outfile.write("%s:%s\n" % (word, wc[word]))
    finally:
        # release the handle even if a write fails part-way through
        outfile.close()
    unique_word_ctr = len(wckeys)
    # formatted as a single string so the output is identical whether
    # print is a statement or a function
    print("%s %s" % ("total unique words: ", unique_word_ctr))

#----------------------------------------------------------------------
# main program:

if __name__ == "__main__":
    # Built-in defaults, used when no directory is given on the command line.
    count_dir = "c:\\Balzac\\balzacbooksnofront\\wordcounts\\"
    master_file = "c:\\python20\\balzacwords.txt"
    # Exactly one argument overrides the directory of count files; any other
    # argument count falls back to the defaults above.
    if len(sys.argv) == 2:
        count_dir = sys.argv[1]
    wc = make_wordcounts(count_dir)
    # Always write the master file: previously the merged counts were
    # computed and then discarded whenever a directory argument was supplied.
    make_master_wordcount(wc, master_file)
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)