# countrefs1 -
 
# count the number of references to a given word or phrase in 
# a Balzac title.

#C:\Python20>python countrefs1.txt >countrefs1out.txt

#========================================================================
# What the program does in words:

# For all lines in the reference file.

# A line contains a reference if it begins with a file reference,
# which is recognized by the regex for a file path.

# If the line contains a reference the next step is to strip
# off the file name without the ".txt" suffix that gives
# the Project Gutenberg title identifier for the Balzac work (e.g. brcrc10).

# Next discard the reference if it is part of the character
# repertory or the introductions and not a story: (1rthc10, 2rthc10, hciaa10).

# Next, if the title already has a count associated with it
# increment the count, otherwise create a counter and set it to 1.

# (also note although I used the grep/search in EditPlus
# (Microsoft VC++ IDE would also work) it would be a good 
# idea to rewrite this grep in an independent stand-alone 
# utility program that could be integrated into other programs)
#=========================================================================
import re, string, sys

def loadtable_balzactitles():
    """Load the title table from "balzactitles.txt" in the current directory.

    A usable line consists of optional leading whitespace, an identifier made
    of word characters ending in "10", the literal ".txt" and then at least
    one more character of title text, for example:

        brcrc10.txt  Some Title

    Lines that do not have this form (blank lines, headings, ...) are
    skipped instead of aborting the whole load.

    Returns a dictionary mapping the identifier without the ".txt" suffix
    (e.g. "brcrc10") to the title text with surrounding whitespace removed.
    When an identifier occurs more than once the last occurrence wins.

    Raises IOError if the table file cannot be opened.
    """
    tablefilename = "balzactitles.txt"
    re_entry = re.compile(r'\s*(\w+10)\.txt(.+)')

    table = {}
    infile = open(tablefilename, "r")
    try:
        for line in infile.readlines():
            m = re_entry.match(line)
            if m is None:
                # Not a table entry: calling m.groups() here would raise
                # AttributeError, so just move on to the next line.
                continue
            key, value = m.groups()
            table[key.strip()] = value.strip()
    finally:
        # Release the handle even if reading fails part-way through.
        infile.close()
    return table

def descending(tuple1,tuple2):
    """Comparison function that orders pairs by their second item, largest first.

    Both arguments are sequences whose item at index 1 is the value to
    compare (here: a reference count).

    Returns 1 if tuple2's value is greater than tuple1's, -1 if it is
    smaller and 0 if the two are equal.
    """
    first = tuple1[1]
    second = tuple2[1]
    # Spelled out with plain comparisons rather than the cmp() builtin, which
    # newer Python versions no longer provide; the -1/0/1 result is the same.
    return (second > first) - (second < first)

def delete_if_present(dict,key):
    """Remove `key` from the mapping `dict`; do nothing if it is not there.

    The mapping is modified in place and nothing is returned.
    """
    # Attempt the deletion and ignore a missing key.  This avoids the
    # has_key() method, which newer Python versions no longer provide, and
    # needs only a single lookup.
    try:
        del dict[key]
    except KeyError:
        pass

def count_references(filename,name):
    """Print a per-title count of the references listed in `filename`.

    filename -- text file with one search hit per line.  Only lines that
                begin with word characters followed by a colon (such as a
                drive letter, "C:") are treated as references.
    name     -- heading printed above the report (the word or phrase that
                was searched for).

    For every reference line the title identifier is taken from the first
    "xxxxx10.txt" file name found on it (five word characters followed by
    "10").  Reference lines without such a file name are ignored.
    Identifiers are folded to lower case before counting, because they are
    matched case-insensitively and both the exclusion list and the titles
    table lookup use lower case.

    The report lists the titles by descending count (equal counts in
    alphabetical order of identifier) with the integer percentage of the
    total, followed by the total itself.  Nothing is returned.

    Raises IOError if `filename` or the titles table cannot be opened.
    """
    titles = loadtable_balzactitles()
    re_path = re.compile(r'\w+:', re.IGNORECASE)    # change this to a full path recognizer
    re_title = re.compile(r'(\w\w\w\w\w10)\.txt', re.IGNORECASE)

    # Tally the references per title identifier.
    refs = {}
    infile = open(filename, 'r')
    try:
        for line in infile.readlines():
            if not re_path.match(line):
                continue
            found = re_title.search(line)
            if found is None:
                # A path without a recognizable title file name.
                continue
            title = found.group(1).lower()
            refs[title] = refs.get(title, 0) + 1
    finally:
        infile.close()

    delete_if_present(refs, '1rthc10')    # not a title (character repertory: part 1)
    delete_if_present(refs, '2rthc10')    # not a title (character repertory: part 2)
    delete_if_present(refs, 'hciaa10')    # not a title (The Human Comedy:  Introductions & Appendix)

    # Sorting (-count, title) pairs gives descending counts with a
    # reproducible order for equal counts.
    total = 0
    ranked = []
    for title in refs.keys():
        total = total + refs[title]
        ranked.append((-refs[title], title))
    ranked.sort()

    # Each line is formatted completely before printing, so the single
    # argument print(...) form emits the same text on old and new
    # interpreters.
    print("\n%s" % (name,))
    print("\n\nReferences Title")
    for negcount, title in ranked:
        count = -negcount
        label = titles.get(title)
        if label is None:
            print("[%s] not in titles table" % (title,))
        else:
            # Whole-number percentage; divmod keeps this an integer
            # division on every interpreter version.  total cannot be zero
            # here because the loop only runs when something was counted.
            percent = divmod(count * 100, total)[0]
            print("%s  (%s%%) %s (%s)" % (count, percent, label, title))

    print("\ntotal references:  %s" % (total,))


#----------------------------------------------------------------------
# main program:

if __name__ == "__main__":
    # Usage: <script> referencefile.txt name
    # With any other number of arguments the built-in Bianchon example is
    # used instead.
    if len(sys.argv) == 3:
        filename, name = sys.argv[1], sys.argv[2]
    else:
        filename, name = "bianchonrefs.txt", "Bianchon"
    count_references(filename, name)
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)