# textwindow11 -
#=========================================================================================
# calculates the "span frequency" of a word as a collocate of another word in a corpus
# e.g. the 5-word-to-either-side span frequency of the word 'sharp'
# as a collocate of the word 'nose' in the corpus of all Balzac's stories
# read whole file into a string, find the offsets for each occurrence of the word
# being searched for, iterate through these occurrences,
# extract a sub-string of 80 characters on either side,
# and then extract the left and right neighbors of the word occurrence from these sub-strings
# iterates over all the files in a collection of Project Gutenberg files
# (here the collected works of Balzac),
# finds occurrences of a word or phrase (w/ a regex),
# extracts a context of five words on either side of the word,
# and dumps these into a bag
# (a Python list, which allows duplicates unlike a set or a Python dictionary key),
# and finally writes the bag to a file, one word per line.
# To summarize: this file or bag contains every word occurrence within a
# window of five words from the word being matched for every story that Balzac wrote.
# To be used to find significant collocations which are identified by statistics
# such as the t-statistic which use the "span frequency" of a word, which is just
# the frequency count of a word in the file we have just constructed.
#====================================================================
import re, string, glob
class StringObject:
    """Holds the full text of one file and finds regex match positions in it."""

    def __init__(self, infilename):
        """Read the whole file named infilename into self.text.

        Lines are joined with a single space (each line keeps its own
        trailing newline from readlines()); self.length caches len(self.text).
        """
        # 'with' guarantees the file is closed even if readlines() raises
        # (the original left the handle open on error).
        with open(infilename, 'r') as infile:
            lines = infile.readlines()
        # ' '.join replaces string.join(), which was removed in Python 3.
        self.text = ' '.join(lines)
        self.length = len(self.text)

    def get_pattern_indices(self, pattern):
        """Return a list of (start, end) tuples, one per case-insensitive,
        non-overlapping match of 'pattern' in self.text, in text order."""
        re_pattern = re.compile(pattern, re.IGNORECASE)
        # finditer yields non-overlapping matches left to right — the same
        # result as the original manual search/advance-position loop.
        return [match.span() for match in re_pattern.finditer(self.text)]
def get_right_neighbors(s, re_word, neighbors, limit=5):
    """Append to the 'neighbors' bag the first (up to) 'limit' words of s,
    lowercased, in text order.

    s is the context string to the right of a match; re_word is the
    word-tokenizing regex (one capture group per word).  'limit' generalizes
    the previously hard-coded window of 5 and defaults to it.
    """
    words = re.findall(re_word, s)
    for word in words[:limit]:
        # str.lower() replaces string.lower(), removed in Python 3.
        neighbors.append(word.lower())
def get_left_neighbors(s, re_word, neighbors, limit=5):
    """Append to the 'neighbors' bag the last (up to) 'limit' words of s,
    lowercased, NEAREST WORD FIRST (i.e. reverse text order).

    s is the context string to the left of a match; re_word is the
    word-tokenizing regex.  'limit' generalizes the previously hard-coded
    window of 5 and defaults to it.
    """
    words = re.findall(re_word, s)
    # words[-0:] would be the whole list, so guard limit <= 0 explicitly.
    tail = words[-limit:] if limit > 0 else []
    for word in reversed(tail):
        # str.lower() replaces string.lower(), removed in Python 3.
        neighbors.append(word.lower())
def get_word_count(words):
    """Return a dict mapping each word in the iterable 'words' to its
    frequency count (handles an empty input by returning {})."""
    count = {}
    for word in words:
        # dict.get replaces the Python-2-only dict.has_key() test.
        count[word] = count.get(word, 0) + 1
    return count
def descending(tuple1, tuple2):
    """cmp-style comparator ordering (word, count) tuples by count,
    highest count first.

    Returns -1, 0 or 1.  Python 3 removed the built-in cmp(), so the sign
    is computed with the standard (a > b) - (a < b) replacement.
    """
    return (tuple2[1] > tuple1[1]) - (tuple2[1] < tuple1[1])
def sort_alphabetically(word_count):
    """Return the words (keys of the word->count dict) as an alphabetically
    sorted list.  sorted() replaces keys()+.sort(), which fails in Python 3
    because dict views have no sort method."""
    return sorted(word_count)
def write_frequency_file_dictionary(filename, dict, keys):
    """Write one 'word:count' line per entry of 'keys', in that order,
    to the file named 'filename'.

    'dict' maps words to counts; 'keys' controls the output order.
    NOTE(review): the parameter name 'dict' shadows the builtin — kept
    only for backward compatibility with keyword callers.
    """
    # 'with' closes the file even if a write raises (original leaked it).
    with open(filename, 'w') as outfile:
        for word in keys:
            outfile.write("%s:%s\n" % (word, dict[word]))
def sort_by_count(word_count):
    """Return the word->count dict as a list of (word, count) tuples,
    sorted by count with the highest count first."""
    # sorted(key=..., reverse=True) replaces the comparator-style
    # list.sort(descending), which Python 3 removed; also avoids the
    # local variable that shadowed the builtin 'tuple'.
    return sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
def write_frequency_file_tuplelist(filename, tuplelist):
    """Write one 'word:count' line per (word, count) tuple in 'tuplelist',
    in list order, to the file named 'filename'."""
    # 'with' closes the file even if a write raises; tuple unpacking
    # avoids the local variable that shadowed the builtin 'tuple'.
    with open(filename, 'w') as outfile:
        for word, count in tuplelist:
            outfile.write("%s:%s\n" % (word, count))
def get_span(filename, re_word, word, neighbors):
    """Collect into the 'neighbors' bag the words near every occurrence of
    'word' (a regex) in the file named 'filename': up to 5 words on each
    side, tokenized by 're_word' out of an 80-character context window."""
    s = StringObject(filename)
    indices = s.get_pattern_indices(word)
    for start, end in indices:
        # max(0, ...) fixes a bug: when a match sits within the first 80
        # characters, start-80 went negative and Python's negative-index
        # slicing silently produced the wrong (usually empty) left context.
        get_left_neighbors(s.text[max(0, start - 80):start], re_word, neighbors)
        get_right_neighbors(s.text[end:end + 80], re_word, neighbors)
def get_span_directory(directory, re_word, word):
    """Run get_span over every '*.*' file directly under 'directory' and
    return the combined bag of neighboring words (a list with duplicates).

    'directory' is expected to end with a path separator; 're_word' is the
    word-tokenizing regex and 'word' the regex being matched.
    """
    bag = []
    for filepath in glob.glob(directory + '*.*'):
        get_span(filepath, re_word, word, bag)
    return bag
#===================================================================
# main:
def main():
    """Build the 5-word span-frequency bag for 'nose' over all Balzac
    files and write the counts out alphabetically (facecount5.txt) and
    by descending frequency (facecount6.txt)."""
    directory = "c:\\Balzac\\balzacbooksnofront\\"
    # Raw string fixes the invalid \s / \w / \- escapes of the original
    # literal (SyntaxWarning on modern Python); the regex is unchanged:
    # a word is \w+, optionally contracted (don't) or hyphenated (well-to-do),
    # bounded by whitespace/punctuation delimiters.
    re_word = r"[\s,.!?;\-\"\']*(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;\-\"\']+"
    word = 'nose'
    neighbors = get_span_directory(directory, re_word, word)
    word_count = get_word_count(neighbors)
    keys = sort_alphabetically(word_count)
    write_frequency_file_dictionary('facecount5.txt', word_count, keys)
    tuplelist = sort_by_count(word_count)
    write_frequency_file_tuplelist('facecount6.txt', tuplelist)

if __name__ == "__main__":
    # Guard added so importing this module no longer kicks off the
    # whole corpus scan as a side effect.
    main()
# Text file Source (historic): geocities.com/soho/square/3472
# (to report bad content: archivehelp @ gmail)