# wordsperfile2 -

# attempting to improve performance

# To rank Balzac's works according to size, count the number of words in
# each Project Gutenberg file containing one of his works, create a list
# of tuples (title,word_count) and sort the list on word_count. 
# C:\Python20>python wordsperfile2.txt >wordsperfile2out.txt
#=================================================================

import re, sys, string, glob, os.path

def loadtable_balzactitles(tablefilename="balzactitles.txt"):
    """Load the table that maps Gutenberg file prefixes to titles.

    Each usable line of the table file looks like
    "<prefix>10.txt <title>", optionally preceded by whitespace.
    Lines that do not have this shape (blank lines, headers, ...) are
    skipped instead of aborting the whole load.

    tablefilename -- path of the table file; defaults to
                     "balzactitles.txt" in the current directory.

    Returns a dictionary whose keys are the file names without the
    ".txt" extension and whose values are the titles with surrounding
    whitespace removed.  If a key occurs more than once, the last
    occurrence wins.

    Raises IOError if the table file cannot be opened.
    """
    entry_pattern = re.compile(r'\s*(\w+10)\.txt(.+)')
    table = {}
    infile = open(tablefilename, "r")
    try:
        for line in infile.readlines():
            m = entry_pattern.match(line)
            if not m:
                # No match means m is None; calling m.groups() on it
                # would raise AttributeError, so skip the line.
                continue
            key, value = m.groups()
            table[key.strip()] = value.strip()
    finally:
        # Release the handle even if reading fails part-way through.
        infile.close()
    return table

def length_classify(wordcount):
    """Return a size label for a work with the given number of words.

    The label is the first band whose lower bound the count strictly
    exceeds; counts of 12000 or fewer are 'very short'.
    """
    # (exclusive lower bound, label), ordered from largest to smallest.
    bands = (
        (80000, 'very long'),
        (50000, 'long'),
        (26000, 'medium'),
        (12000, 'short'),
    )
    for lower_bound, label in bands:
        if wordcount > lower_bound:
            return label
    return 'very short'

def descending(tuple1,tuple2):
    """Comparison function that orders tuples by item [2], largest first.

    Returns 1 if tuple2[2] is greater than tuple1[2], -1 if it is
    smaller, and 0 if they are equal -- the same values cmp() gave.
    The result is computed from plain comparisons because the cmp()
    builtin is not available in Python 3.
    """
    first = tuple1[2]
    second = tuple2[2]
    return (second > first) - (second < first)

def countwords(file):
    """Return the number of words found in the text file at path `file`.

    A word is a run of word characters, optionally containing a single
    apostrophe or hyphen (as in "don't" or "well-known"), that is
    followed by whitespace, one of , . ! ? ; or the end of the line.
    Accepting the end of the line means the last word of a file that
    has no final newline is counted as well.

    Raises IOError if the file cannot be opened.
    """
    word_pattern = re.compile(r"(\w+'\w+|\w+-\w+|\w+)(?:[\s,.!?;]|$)")
    wordcount = 0
    infile = open(file, 'r')
    try:
        for line in infile.readlines():
            wordcount = wordcount + len(word_pattern.findall(line))
    finally:
        # Release the handle even if reading fails part-way through.
        infile.close()
    return wordcount

def wordsperfile_indirectory(dir):
    """Count the words of every "*.*" file in `dir` that has a .txt name.

    dir -- directory to scan; a trailing path separator is optional.

    Returns a list of tuples
        (title, file_prefix, wordcount, length_label)
    sorted by wordcount, largest first.  file_prefix is the file name
    without ".txt"; title comes from loadtable_balzactitles() and is ""
    when the lower-cased prefix is not in that table (a notice is
    printed in that case).  Files whose name yields no prefix are
    reported and left out of the result.
    """
    titles = loadtable_balzactitles()
    prefix_pattern = re.compile(r"\s*(\w+?)\.txt", re.IGNORECASE)
    wordcounts = []
    # os.path.join supplies the separator when `dir` does not end in one.
    for path in glob.glob(os.path.join(dir, '*.*')):
        filename = os.path.split(path)[1]
        m = prefix_pattern.search(filename)
        if not m:
            print("ERROR: Can't get prefix from filename:  %s" % filename)
            continue
        file_prefix = m.group(1)
        # Count only after the name check, so rejected files are not read.
        wordcount = countwords(path)
        title = titles.get(file_prefix.lower())
        if title is None:
            print("[%s] not in titles table" % (file_prefix))
            title = ""
        # Every entry has all four fields, so index [3] is always valid.
        wordcounts.append(
            (title, file_prefix, wordcount, length_classify(wordcount)))
    # Stable sort on the word count, largest first; files with equal
    # counts stay in the order glob returned them.
    wordcounts.sort(key=lambda entry: entry[2], reverse=True)
    return wordcounts


#----------------------------------------------------------------------
# main program:

if __name__ == "__main__":
    # Usage: <script> [directory]
    # With exactly one argument that directory is scanned; otherwise the
    # built-in default directory is used.  The report is printed in both
    # cases (previously an explicit directory produced no report).
    if len(sys.argv) == 2:
        directory = sys.argv[1]
    else:
        #directory = "c:\\Balzac\\tmp\\"
        directory = "c:\\Balzac\\balzacbooksnofront\\"
    print("----------Count words in files-----")
    wordcounts = wordsperfile_indirectory(directory)
    print("Title     Word Count")
    for entry in wordcounts:
        title, file_prefix, count = entry[0], entry[1], entry[2]
        # Derive the label from the count rather than reading entry[3],
        # so an entry that lacks a fourth field cannot raise IndexError.
        print("%s (%s): %s words (%s)" % (title, file_prefix, count, length_classify(count)))
    print("%s works have been counted" % (len(wordcounts)))
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)