# textwindow8 -
# collects "span occurrences" of a word, used to calculate the "span frequency",
# which is used to find significant "collocations" of a word in a "corpus"

# read the whole file into a string, find the offsets of each occurrence of the word
# being searched for, iterate through these occurrences,
# extract a sub-string of 80 characters on either side,
# and then extract the left and right neighbors of the word occurrence from these sub-strings

# iterates over all the files in a collection of Project Gutenberg files
# (here the collected works of Balzac),
# finds occurrences of a word or phrase (w/ a regex),
# extracts a context of five words on either side of the word,
# and dumps these into a bag
# (a Python list, which allows duplicates, unlike a set or a Python dictionary key),
# and finally writes the bag to a file, one word per line.
# To summarize: this file or bag contains every word occurrence within a
# window of five words from the word being matched, for every story that Balzac wrote.
# It is used to find significant collocations, which are identified by statistics
# such as the t-statistic; these use the "span frequency" of a word, which is just
# the frequency count of a word in the file we have just constructed.
#====================================================================
 
import re, string

class StringObject:
    """Hold the full text of a file and locate regex matches inside it.

    Attributes set by the constructor:
      text   -- the file's lines joined with one space between them
                (each line keeps its own trailing newline)
      length -- len(text), used as the upper bound of the search loop
    """

    def __init__(self, infilename):
        """Read the file named `infilename` in text mode into self.text."""
        # 'with' closes the handle even if reading raises.
        with open(infilename, 'r') as infile:
            # str.join replaces string.join(), which exists only in Python 2;
            # the resulting text is the same.
            self.text = ' '.join(infile.readlines())
        self.length = len(self.text)

    def get_pattern_indices(self, pattern):
        """Return (start, end) index pairs for the matches of `pattern`.

        `pattern` is a regular expression source string; it is compiled
        with re.IGNORECASE. Matches are found left to right and do not
        overlap: each search resumes where the previous match ended.
        Returns an empty list when nothing matches.
        """
        re_pattern = re.compile(pattern, re.IGNORECASE)
        matches = []
        pos = 0
        while pos < self.length:
            match = re_pattern.search(self.text, pos)
            if match is None:
                break
            start, end = match.span()
            matches.append((start, end))
            # A zero-width match leaves `end` where the search began, which
            # would repeat the same search forever; step one character past
            # it so the loop always makes progress.
            pos = end if end > start else end + 1
        return matches


def get_right_neighbors(s, re_word, window=5):
    """Return up to `window` leading matches of `re_word` in `s`.

    s       -- the text that follows the matched word
    re_word -- regular expression passed to re.findall; the list items are
               whatever findall yields for it (the group text when the
               pattern has exactly one capturing group)
    window  -- maximum number of neighbors to return (default 5); values
               below zero are treated as zero

    The result is ordered left to right, i.e. nearest neighbor first, and
    is shorter than `window` when `s` contains fewer matches.
    """
    limit = max(window, 0)
    return re.findall(re_word, s)[:limit]

def get_left_neighbors(s, re_word, window=5):
    """Return up to `window` trailing matches of `re_word` in `s`, last first.

    s       -- the text that precedes the matched word
    re_word -- regular expression passed to re.findall; the list items are
               whatever findall yields for it (the group text when the
               pattern has exactly one capturing group)
    window  -- maximum number of neighbors to return (default 5); values
               below zero are treated as zero

    The result is ordered right to left, so the match closest to the end
    of `s` comes first. It is shorter than `window` when `s` contains
    fewer matches.
    """
    limit = max(window, 0)
    # Reverse first, then truncate: slicing with [-limit:] would return the
    # whole list when limit is 0.
    return re.findall(re_word, s)[::-1][:limit]
                  
#===================================================================
# main:

# Input text to scan (hard-coded path to a single file).
filename = "c:\\Balzac\\balzacbooksnofront\\thrty10.txt"
# Word pattern: optional leading delimiters, then a captured word (allowing
# one internal apostrophe or hyphen), then at least one trailing delimiter.
re_word = "[\s,.!?;\-\"\']*(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"\']+"
s = StringObject(filename)
# Pattern whose occurrences are located; it is matched case-insensitively
# anywhere in the text, including inside longer words.
middle = 'face'
indices = s.get_pattern_indices(middle)
for left, right in indices:
    # Clamp the window start at 0: for a match within the first 80
    # characters, left - 80 is negative and would be taken as an offset
    # from the END of the text, producing an empty or wrong left context.
    window_start = max(0, left - 80)
    front = get_left_neighbors(s.text[window_start:left], re_word)
    back = get_right_neighbors(s.text[right:right + 80], re_word)
    big = s.text[window_start:right + 80]
    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("%s | %s | %s\n%s\n" % (front, middle, back, big))
# Text file source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)