# textwindow6 -
# Finds places in a Project Gutenberg file where a word or phrase occurs (using a regex),
# extracts a context of five words on either side of the match,
# and prints the match in context. The larger context (the line containing the
# match, concatenated with the lines before and after it) is also printed.
import re, string
class WordContext:
    """Pick a few "words" off the left or right end of a string.

    A word is whatever the regular expression ``re_word`` matches
    (matching is case-insensitive).  The context methods return the slice
    of the input that runs from the start of the first selected word match
    to the end of the last selected one.
    """

    def __init__(self, re_word):
        """Remember ``re_word``, the regex source that recognises one word."""
        self.re_word = re_word

    def get_word_indices(self, str):
        """Return the ``(start, end)`` span of every word match in ``str``.

        Spans are listed left to right and do not overlap.  Zero-width
        matches are skipped.

        The parameter name shadows the builtin ``str``; it is kept so that
        existing keyword callers continue to work.
        """
        re_pattern = re.compile(self.re_word, re.IGNORECASE)
        # finditer() always moves forward, even when the pattern can match
        # the empty string; a hand-rolled search loop that restarts at the
        # end of the previous match would spin forever on such a match.
        return [
            match.span()
            for match in re_pattern.finditer(str)
            if match.end() > match.start()
        ]

    def get_left_context(self, s, max_words=5):
        """Return the text covering the first ``max_words`` words of ``s``.

        Returns ``''`` when ``s`` contains no word or ``max_words`` < 1.
        """
        spans = self.get_word_indices(s)[:max(max_words, 0)]
        if not spans:
            return ''
        return s[spans[0][0]:spans[-1][1]]

    def get_right_context(self, s, max_words=5):
        """Return the text covering the last ``max_words`` words of ``s``.

        Returns ``''`` when ``s`` contains no word or ``max_words`` < 1.
        """
        spans = self.get_word_indices(s)
        if max_words < 1 or not spans:
            return ''
        # A negative slice start clamps at the beginning, so shorter inputs
        # simply contribute every word they have.
        spans = spans[-max_words:]
        return s[spans[0][0]:spans[-1][1]]
class TextWindow:
    """Three-line sliding window that reports a pattern match in context.

    Lines are fed in one at a time with insert().  Once three lines are
    held, search() looks for ``pattern`` in the middle (second-newest)
    line and, on a hit, returns the matched text with the words that
    surround it, taken from the three lines joined together.

    ``window`` is a ring buffer: ``front`` is the slot the next insert()
    overwrites, which (once the buffer is full) is also the oldest line.
    """

    def __init__(self, pattern, re_word):
        """Set up an empty window.

        pattern -- regex searched for in the middle line.
        re_word -- regex describing one word, handed to WordContext.
        """
        self.pattern = pattern
        self.re_word = re_word
        self.window = ['', '', '']
        self.front = 0
        self.wc = WordContext(self.re_word)

    def insert(self, s):
        """Store ``s`` over the oldest slot and advance ``front``.

        Trailing whitespace (including the newline) is replaced by a single
        space, so a stored line is never the empty string.
        """
        self.window[self.front] = s.rstrip() + ' '
        self.front = (self.front + 1) % 3

    def buffer_full(self):
        """Return True once all three slots hold a line."""
        return '' not in self.window

    def search(self):
        """Look for ``pattern`` in the middle line of the window.

        Returns a tuple ``(front, middle, back, lines)``:
          front  -- the words just before the match,
          middle -- the matched text,
          back   -- the words just after the match,
          lines  -- the three window lines joined by single spaces.
        All four are ``''`` when the window is not yet full or the middle
        line does not contain the pattern.
        """
        front = middle = back = lines = ''
        if not self.buffer_full():
            return front, middle, back, lines

        # Read the ring buffer in chronological order; slot numbers alone
        # say nothing about age once the buffer has wrapped around.
        oldest, current, newest = [
            self.window[(self.front + i) % 3].rstrip() for i in range(3)
        ]
        # Keep the single trailing space that insert() stores, so patterns
        # that end on whitespace can still match at the end of the line.
        match = re.search(self.pattern, current + ' ')
        if match:
            lines = oldest + ' ' + current + ' ' + newest
            # Translate the span found in the middle line into `lines`
            # coordinates.  Searching `lines` from scratch could instead
            # report a hit that sits in the older line.
            offset = len(oldest) + 1
            start = offset + match.start()
            end = offset + match.end()
            left = lines[:start]
            middle = lines[start:end]
            # Keep the whole tail (no character dropped).  The extra space
            # lets a word pattern that requires a delimiter after each word
            # still recognise the very last word.
            right = lines[end:] + ' '
            front = self.wc.get_right_context(left)
            back = self.wc.get_left_context(right)
        return front, middle, back, lines
#===================================================================
# main:
# Text to scan.
filename = "c:\\Balzac\\balzacbooksnofront\\thrty10.txt"
# One "word": optional leading delimiters, then a word (optionally with an
# inner apostrophe or hyphen), then at least one trailing delimiter.
# Written as a raw string so the regex escapes reach `re` untouched; the
# pattern matches exactly what the non-raw spelling matched.
re_word = r"[\s,.!?;\-\"\']*(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"\']+"
tw = TextWindow('face', re_word)
# `with` closes the file even if a line fails to process; iterating the file
# object streams it line by line instead of loading the whole book first.
with open(filename, 'r') as infile:
    for line in infile:
        tw.insert(line)
        front, middle, back, lines = tw.search()
        if front or middle or back:
            # Single parenthesised argument: prints identically under
            # Python 2 and Python 3.
            print("%s | %s | %s \n%s\n" % (front, middle, back, lines))
# Text file source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square  geocities.com/soho
# (to report bad content: archivehelp @ gmail)
|
|
|
|
|