# textwindow11 -
#=========================================================================================
# calculates the "span frequency" of a word as a collocate of another word in a corpus
# e.g. the 5-word-to-either-side span frequency of the word 'sharp'
# as a collocate of the word 'nose' in the corpus of all Balzac's stories
# read whole file into a string, find the offsets for each occurrence of the word
# being searched for, iterate through these occurrences,
# extract a sub-string of 80 characters on either side,
# and then extract the left and right neighbors of the word occurrence from these sub-strings
# iterates over all the files in a collection of Project Gutenberg files
# (here the collected works of Balzac),
# finds occurrences of a word or phrase (w/ a regex),
# extracts a context of five words on either side of the word,
# and dumps these into a bag
# (a Python list, which allows duplicates unlike a set or a Python dictionary key),
# and finally writes the bag to a file, one word per line.
# To summarize: this file or bag contains every word occurrence within a
# window of five words from the word being matched for every story that Balzac wrote.
# To be used to find significant collocations which are identified by statistics
# such as the t-statistic which use the "span frequency" of a word, which is just
# the frequency count of a word in the file we have just constructed.
#====================================================================
import re, string, glob
class StringObject:
    """Holds the full text of one file and finds regex match positions in it."""

    def __init__(self, infilename):
        """Read the whole file named infilename into self.text.

        Lines are joined with a single space (each line keeps its own
        trailing newline from readlines()); self.length caches len(self.text).
        """
        # 'with' guarantees the file is closed even if readlines() raises
        # (the original left the handle open on error).
        with open(infilename, 'r') as infile:
            lines = infile.readlines()
        # ' '.join replaces string.join(), which was removed in Python 3.
        self.text = ' '.join(lines)
        self.length = len(self.text)

    def get_pattern_indices(self, pattern):
        """Return a list of (start, end) tuples, one per case-insensitive,
        non-overlapping match of 'pattern' in self.text, in text order."""
        re_pattern = re.compile(pattern, re.IGNORECASE)
        # finditer yields non-overlapping matches left to right — the same
        # result as the original manual search/advance-position loop.
        return [match.span() for match in re_pattern.finditer(self.text)]
def get_right_neighbors(s, re_word, neighbors, limit=5):
    """Append to the 'neighbors' bag the first (up to) 'limit' words of s,
    lowercased, in text order.

    s is the context string to the right of a match; re_word is the
    word-tokenizing regex (one capture group per word).  'limit' generalizes
    the previously hard-coded window of 5 and defaults to it.
    """
    words = re.findall(re_word, s)
    for word in words[:limit]:
        # str.lower() replaces string.lower(), removed in Python 3.
        neighbors.append(word.lower())
def get_left_neighbors(s, re_word, neighbors, limit=5):
    """Append to the 'neighbors' bag the last (up to) 'limit' words of s,
    lowercased, NEAREST WORD FIRST (i.e. reverse text order).

    s is the context string to the left of a match; re_word is the
    word-tokenizing regex.  'limit' generalizes the previously hard-coded
    window of 5 and defaults to it.
    """
    words = re.findall(re_word, s)
    # words[-0:] would be the whole list, so guard limit <= 0 explicitly.
    tail = words[-limit:] if limit > 0 else []
    for word in reversed(tail):
        # str.lower() replaces string.lower(), removed in Python 3.
        neighbors.append(word.lower())
def get_word_count(words):
    """Return a dict mapping each word in the iterable 'words' to its
    frequency count (handles an empty input by returning {})."""
    count = {}
    for word in words:
        # dict.get replaces the Python-2-only dict.has_key() test.
        count[word] = count.get(word, 0) + 1
    return count
def descending(tuple1, tuple2):
    """cmp-style comparator ordering (word, count) tuples by count,
    highest count first.

    Returns -1, 0 or 1.  Python 3 removed the built-in cmp(), so the sign
    is computed with the standard (a > b) - (a < b) replacement.
    """
    return (tuple2[1] > tuple1[1]) - (tuple2[1] < tuple1[1])
def sort_alphabetically(word_count):
    """Return the words (keys of the word->count dict) as an alphabetically
    sorted list.  sorted() replaces keys()+.sort(), which fails in Python 3
    because dict views have no sort method."""
    return sorted(word_count)
def write_frequency_file_dictionary(filename, dict, keys):
    """Write one 'word:count' line per entry of 'keys', in that order,
    to the file named 'filename'.

    'dict' maps words to counts; 'keys' controls the output order.
    NOTE(review): the parameter name 'dict' shadows the builtin — kept
    only for backward compatibility with keyword callers.
    """
    # 'with' closes the file even if a write raises (original leaked it).
    with open(filename, 'w') as outfile:
        for word in keys:
            outfile.write("%s:%s\n" % (word, dict[word]))
def sort_by_count(word_count):
    """Return the word->count dict as a list of (word, count) tuples,
    sorted by count with the highest count first."""
    # sorted(key=..., reverse=True) replaces the comparator-style
    # list.sort(descending), which Python 3 removed; also avoids the
    # local variable that shadowed the builtin 'tuple'.
    return sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
def write_frequency_file_tuplelist(filename, tuplelist):
    """Write one 'word:count' line per (word, count) tuple in 'tuplelist',
    in list order, to the file named 'filename'."""
    # 'with' closes the file even if a write raises; tuple unpacking
    # avoids the local variable that shadowed the builtin 'tuple'.
    with open(filename, 'w') as outfile:
        for word, count in tuplelist:
            outfile.write("%s:%s\n" % (word, count))
def get_span(filename, re_word, word, neighbors):
    """Collect into the 'neighbors' bag the words near every occurrence of
    'word' (a regex) in the file named 'filename': up to 5 words on each
    side, tokenized by 're_word' out of an 80-character context window."""
    s = StringObject(filename)
    indices = s.get_pattern_indices(word)
    for start, end in indices:
        # max(0, ...) fixes a bug: when a match sits within the first 80
        # characters, start-80 went negative and Python's negative-index
        # slicing silently produced the wrong (usually empty) left context.
        get_left_neighbors(s.text[max(0, start - 80):start], re_word, neighbors)
        get_right_neighbors(s.text[end:end + 80], re_word, neighbors)
def get_span_directory(directory, re_word, word):
    """Run get_span over every '*.*' file directly under 'directory' and
    return the combined bag of neighboring words (a list with duplicates).

    'directory' is expected to end with a path separator; 're_word' is the
    word-tokenizing regex and 'word' the regex being matched.
    """
    bag = []
    for filepath in glob.glob(directory + '*.*'):
        get_span(filepath, re_word, word, bag)
    return bag
#===================================================================
# main:
def main():
    """Build the 5-word span-frequency bag for 'nose' over all Balzac
    files and write the counts out alphabetically (facecount5.txt) and
    by descending frequency (facecount6.txt)."""
    directory = "c:\\Balzac\\balzacbooksnofront\\"
    # Raw string fixes the invalid \s / \w / \- escapes of the original
    # literal (SyntaxWarning on modern Python); the regex is unchanged:
    # a word is \w+, optionally contracted (don't) or hyphenated (well-to-do),
    # bounded by whitespace/punctuation delimiters.
    re_word = r"[\s,.!?;\-\"\']*(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"\']+"
    word = 'nose'
    neighbors = get_span_directory(directory, re_word, word)
    word_count = get_word_count(neighbors)
    keys = sort_alphabetically(word_count)
    write_frequency_file_dictionary('facecount5.txt', word_count, keys)
    tuplelist = sort_by_count(word_count)
    write_frequency_file_tuplelist('facecount6.txt', tuplelist)

if __name__ == "__main__":
    # Guard added so importing this module no longer kicks off the
    # whole corpus scan as a side effect.
    main()
# Text file Source (historic): geocities.com/soho/square/3472
# (to report bad content: archivehelp @ gmail)