# textwindow6 -
# Finds places in a Project Gutenberg file where a word or phrase occurs (via a regex),
# extracts a context of five words on either side of the match,
# and prints the match in context. The larger context (the line containing the
# match, concatenated with the lines before and after it) is also printed.
 
import re, string

class WordContext:
    """Pick the first or last few "words" out of a string.

    A "word" is any non-empty match of the regular expression ``re_word``
    (matched case-insensitively).  ``num_words`` is the maximum number of
    words a context may span; it defaults to 5.
    """

    def __init__(self, re_word, num_words=5):
        if num_words < 1:
            raise ValueError("num_words must be at least 1")
        self.re_word = re_word
        self.num_words = num_words

    def get_word_indices(self, str):
        """Return the (start, end) spans of every word found in ``str``.

        Spans are non-overlapping and listed left to right.  Zero-width
        matches are skipped: they carry no text, and re-searching from the
        end of such a match would never advance through the string.
        """
        # NOTE: the parameter name shadows the builtin ``str``; it is kept
        # so that keyword callers continue to work.
        re_pattern = re.compile(self.re_word, re.IGNORECASE)
        return [m.span() for m in re_pattern.finditer(str)
                if m.end() > m.start()]

    def get_left_context(self, s):
        """Return the slice of ``s`` covering its first ``num_words`` words.

        The slice runs from the start of the first word to the end of the
        last word taken.  Returns '' when ``s`` contains no word.
        """
        spans = self.get_word_indices(s)
        if not spans:
            return ''
        taken = spans[:self.num_words]
        return s[taken[0][0]:taken[-1][1]]

    def get_right_context(self, s):
        """Return the slice of ``s`` covering its last ``num_words`` words.

        The slice runs from the start of the earliest word taken to the end
        of the final word.  Returns '' when ``s`` contains no word.
        """
        spans = self.get_word_indices(s)
        if not spans:
            return ''
        taken = spans[-self.num_words:]
        return s[taken[0][0]:taken[-1][1]]

class TextWindow:
    """Sliding window over the three most recently inserted lines.

    ``pattern`` is the regex searched for (case-sensitively) in the middle
    line of the window; ``re_word`` is the regex that defines a word for
    the surrounding context.  A line is only examined once one further
    line has been inserted after it, so the first and the last line fed
    to the window are never searched themselves.
    """

    def __init__(self, pattern, re_word):
        self.pattern = pattern
        self.re_word = re_word
        # Ring buffer of three lines; ``front`` is the slot that the next
        # insert() overwrites, i.e. the oldest line once the buffer is full.
        self.window = ['', '', '']
        self.front = 0
        self.wc = WordContext(self.re_word)

    def insert(self, s):
        """Store line ``s``, replacing the oldest line in the window.

        Trailing whitespace is replaced by a single space, so even a blank
        line is stored as a non-empty string (see buffer_full()).
        """
        self.window[self.front] = s.rstrip() + ' '
        self.front = (self.front + 1) % 3

    def buffer_full(self):
        """Return True once all three slots hold an inserted line."""
        return '' not in self.window

    def _ordered_lines(self):
        """Return the window contents as (oldest, middle, newest)."""
        oldest = self.front
        return (self.window[oldest],
                self.window[(oldest + 1) % 3],
                self.window[(oldest + 2) % 3])

    def _split_at_match(self):
        """Locate the pattern in the middle line of the window.

        Returns (left, matched, right, lines), where ``lines`` is the three
        lines joined by single spaces in insertion order and the other
        three are the parts of ``lines`` before, inside and after the
        match.  Returns None if the buffer is not full or the middle line
        does not match.
        """
        if not self.buffer_full():
            return None
        before, current, after = self._ordered_lines()
        match = re.search(self.pattern, current)
        if not match:
            return None
        before_text = before.rstrip()
        lines = before_text + ' ' + current.rstrip() + ' ' + after.rstrip()
        # Map the match from the middle line into ``lines`` instead of
        # searching ``lines`` again: a second search would report a hit in
        # the preceding line first, or fail for an anchored pattern.
        offset = len(before_text) + 1
        start = offset + match.start()
        end = offset + match.end()
        return lines[:start], lines[start:end], lines[end:], lines

    def search(self):
        """Search the middle line and return (front, middle, back, lines).

        ``middle`` is the matched text, ``front`` and ``back`` are the
        words just before and just after it as selected by the
        WordContext, and ``lines`` is the joined three-line context.  All
        four are '' when there is no match or the buffer is not yet full.
        """
        parts = self._split_at_match()
        if parts is None:
            return '', '', '', ''
        left, middle, right, lines = parts
        front = self.wc.get_right_context(left)
        back = self.wc.get_left_context(right)
        return front, middle, back, lines
                  
#===================================================================
# main:

# Driver: feed the file through a three-line TextWindow and print every
# occurrence of 'face' with its word context and the joined three lines.
filename = "c:\\Balzac\\balzacbooksnofront\\thrty10.txt"
# Raw string so the regex escapes reach the re module unchanged.
# A word is: optional leading separators, a plain / apostrophised /
# hyphenated word, then at least one trailing separator.
re_word = r"[\s,.!?;\-\"']*(\w+'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"']+"
tw = TextWindow('face', re_word)
infile = open(filename, 'r')
try:
    # Iterate the file lazily rather than loading every line up front.
    for line in infile:
        tw.insert(line)
        front, middle, back, lines = tw.search()
        if front or middle or back:
            # Single parenthesised argument: prints the same text under
            # both the print statement and the print function.
            print("%s | %s | %s \n%s\n" % (front, middle, back, lines))
finally:
    # Close the file even if processing a line raises.
    infile.close()
# Text file source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)