# textwindow8 -
# collects "span occurrences" of a word, used to calculate the "span frequency"
# which is used to find significant "collocations" of a word in a "corpus".
# Read the whole file into a string, find the offsets of each occurrence of the word
# being searched for, iterate through these occurrences,
# extract a sub-string of 80 characters on either side,
# and then extract the left and right neighbors of the word occurrence from these sub-strings.
# Iterates over all the files in a collection of Project Gutenberg files
# (here the collected works of Balzac),
# finds occurrences of a word or phrase (w/ a regex),
# extracts a context of five words on either side of the word,
# and dumps these into a bag
# (a Python list, which allows duplicates, unlike a set or a Python dictionary key),
# and finally writes the bag to a file, one word per line.
# To summarize: this file or bag contains every word occurrence within a
# window of five words from the word being matched, for every story that Balzac wrote.
# To be used to find significant collocations, which are identified by statistics
# such as the t-statistic; these use the "span frequency" of a word, which is just
# the frequency count of a word in the file we have just constructed.
#====================================================================
import re, string
class StringObject:
    """Hold the full text of a file as one string and locate regex matches in it.

    Attributes:
        text: the file's lines joined with a single space between them
            (each line keeps its own trailing newline).
        length: number of characters in ``text``.
    """

    def __init__(self, infilename):
        """Read ``infilename`` into ``self.text`` and record its length.

        Raises whatever ``open``/``readlines`` raise if the file cannot
        be read.
        """
        # ``with`` closes the handle even if reading raises.
        with open(infilename, 'r') as infile:
            lines = infile.readlines()
        # str.join replaces the deprecated string.join module function.
        self.text = ' '.join(lines)
        self.length = len(self.text)

    def get_pattern_indices(self, pattern):
        """Return the spans of all non-overlapping matches of ``pattern``.

        The pattern is compiled case-insensitively and the text is scanned
        left to right.  The result is a list of ``(start, end)`` tuples
        indexing into ``self.text``; it is empty when nothing matches.
        """
        re_pattern = re.compile(pattern, re.IGNORECASE)
        # finditer always advances past a zero-width match, so a pattern
        # that can match the empty string (e.g. "x*") cannot loop forever.
        return [match.span() for match in re_pattern.finditer(self.text)]
def get_right_neighbors(s, re_word, window=5):
    """Return the first ``window`` matches of ``re_word`` in ``s``.

    Args:
        s: the text immediately to the right of the matched word.
        re_word: regular expression passed to ``re.findall``; each list
            item is whatever ``findall`` yields for it (the group's text
            when the pattern has exactly one group).
        window: maximum number of neighbors to return (default 5).
            Values of zero or less yield an empty list.

    Returns:
        A list of at most ``window`` matches in text order, i.e. nearest
        neighbor first.
    """
    words = re.findall(re_word, s)
    # Clamp so a negative window cannot turn into a from-the-end slice.
    return words[:max(window, 0)]
def get_left_neighbors(s, re_word, window=5):
    """Return the last ``window`` matches of ``re_word`` in ``s``, nearest first.

    Args:
        s: the text immediately to the left of the matched word.
        re_word: regular expression passed to ``re.findall``; each list
            item is whatever ``findall`` yields for it (the group's text
            when the pattern has exactly one group).
        window: maximum number of neighbors to return (default 5).
            Values of zero or less yield an empty list.

    Returns:
        A list of at most ``window`` matches in reverse text order, so the
        match closest to the end of ``s`` comes first.
    """
    words = re.findall(re_word, s)
    # Reverse so that index 0 is the neighbor closest to the matched word.
    words.reverse()
    # Clamp so a negative window cannot turn into a from-the-end slice.
    return words[:max(window, 0)]
#===================================================================
# main:
# Input text and the regex that picks out one word (plain, apostrophised or
# hyphenated) together with its surrounding punctuation/whitespace.
filename = "c:\\Balzac\\balzacbooksnofront\\thrty10.txt"
re_word = "[\s,.!?;\-\"\']*(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"\']+"
# Number of characters of raw context taken on each side of a match.
context_chars = 80
s = StringObject(filename)
middle = 'face'
indices = s.get_pattern_indices(middle)
for left, right in indices:
    # Clamp at 0: a negative start index would slice from the END of the
    # text and lose the left context of matches near the top of the file.
    start = max(0, left - context_chars)
    front = get_left_neighbors(s.text[start:left], re_word)
    back = get_right_neighbors(s.text[right:right + context_chars], re_word)
    big = s.text[start:right + context_chars]
    # Single parenthesised argument: prints the same on Python 2 and 3.
    print("%s | %s | %s\n%s\n" % (front, middle, back, big))
# Text file source (historic): geocities.com/soho/square/3472
# (parent pages: geocities.com/soho/square, geocities.com/soho)
# (to report bad content: archivehelp @ gmail)