# doesn't work yet - under construction
# tile1 -
# simple text tiling
# Once we have extracted the words from the file, we don't need to calculate the
# list of token sequences, since they are given by a regular sequence of indices:
# since 4 token sequences form a block and 20 words form a token sequence, we
# can collect the words in a block with a simple list slice.
#=================================================================
import glob
import math
import os.path
import re
import string
import sys
class TextTile:
    """Score lexical cohesion at the token-sequence gaps of a text file.

    The text is reduced to a flat list of words.  A "token sequence" is a run
    of ``words_per_sequence`` consecutive words and a "block" is
    ``sequences_per_block`` consecutive token sequences.  Because both sizes
    are fixed, the block on either side of a gap is a plain slice of the word
    list.  For every gap that has a complete block on both sides, the cosine
    similarity of the two blocks' term-frequency vectors is stored in
    ``self.cohesion``.

    Attributes set by ``__init__``:
        file, sequences_per_block, words_per_sequence -- the arguments as given
        words           -- list of words extracted from the file
        word_count      -- len(words)
        blockwordsize   -- number of words in one block
        first_gap_index -- word index of the first scored gap
        last_gap_index  -- largest word index at which a scored gap may sit
        gap_count       -- number of scored gaps
        cohesion        -- list of gap_count cohesion scores
    """

    def __init__(self, file, sequences_per_block, words_per_sequence):
        """Read ``file`` and compute the cohesion score of every scored gap.

        Args:
            file: path of the text file to tile.
            sequences_per_block: token sequences per block (positive int).
            words_per_sequence: words per token sequence (positive int).

        Raises:
            ValueError: if either size argument is smaller than 1.
        """
        if sequences_per_block < 1 or words_per_sequence < 1:
            raise ValueError(
                "sequences_per_block and words_per_sequence must be positive")
        self.file = file
        self.sequences_per_block = sequences_per_block
        self.words_per_sequence = words_per_sequence
        self.words = self.extractwords(file)
        self.blockwordsize = sequences_per_block * words_per_sequence
        self.word_count = len(self.words)
        # A gap is scored only when a whole block fits on each side of it, so
        # the first gap sits one block in and the last at most one block
        # before the end of the text.
        self.first_gap_index = self.blockwordsize
        self.last_gap_index = self.word_count - self.blockwordsize
        span = self.last_gap_index - self.first_gap_index
        if span < 0:
            # The text is too short to hold two full blocks.
            self.gap_count = 0
        else:
            # Gaps are spaced one token sequence apart; // keeps the count an
            # int on both Python 2 and Python 3.
            self.gap_count = span // words_per_sequence + 1
        self.cohesion = [
            self.get_cohesion(*self.get_block_words(gap))
            for gap in range(self.gap_count)
        ]

    def get_cohesion(self, left_block, right_block):
        """Return the cosine similarity of two blocks of words.

        Args:
            left_block: list of words before the gap.
            right_block: list of words after the gap.
        """
        # get term vectors for left and right blocks
        left_termvector, right_termvector = self.get_termvectors(
            left_block, right_block)
        return self.cosine(left_termvector, right_termvector)

    def cosine(self, v1, v2):
        """Return the cosine of the angle between numeric vectors v1 and v2.

        Prints an error message and returns 0 when the vectors differ in
        length or when the denominator is zero (either vector empty or all
        zeros).
        """
        if len(v1) != len(v2):
            print("ERROR: length of vectors is different, cannot calculate cosine")
            print("v1:  %s" % (v1,))
            print("v2:  %s" % (v2,))
            return 0
        # Accumulate over every component (the sums must not be overwritten
        # on each iteration).
        crossproduct = sum(a * b for a, b in zip(v1, v2))
        v1_sumofsquares = sum(a * a for a in v1)
        v2_sumofsquares = sum(b * b for b in v2)
        denominator = math.sqrt(v1_sumofsquares) * math.sqrt(v2_sumofsquares)
        if denominator == 0:
            print("ERROR: denominator of cosine is zero\nv1:  %s \nv2:  %s"
                  % (v1, v2))
            return 0
        # denominator is a float, so this is true division on Python 2 too.
        return crossproduct / denominator

    def get_termvectors(self, left_block, right_block):
        """Return term-frequency vectors for the two blocks.

        The vocabulary is the sorted union of the words of both blocks; that
        order defines the position of each word in both vectors.  No further
        transform (weighting, normalisation) is applied to the raw counts.

        Returns:
            (left_termvector, right_termvector): two lists of ints, each as
            long as the vocabulary.
        """
        # union of words in left and right blocks (duplicates eliminated),
        # sorted so that both vectors share one word order
        vocabulary = sorted(set(left_block) | set(right_block))
        position = dict((word, i) for i, word in enumerate(vocabulary))
        left_termvector = [0] * len(vocabulary)
        right_termvector = [0] * len(vocabulary)
        # term frequencies: one increment per occurrence (duplicates included)
        for word in left_block:
            left_termvector[position[word]] += 1
        for word in right_block:
            right_termvector[position[word]] += 1
        return left_termvector, right_termvector

    def extractwords(self, file):
        """Return the list of words found in the text file at path ``file``.

        Lines are stripped and joined with single spaces.  A word is a run of
        word characters, optionally containing one apostrophe or one hyphen,
        that is immediately followed by whitespace or one of ``,.!?;``.
        """
        # The with-block closes the file even if reading fails.
        with open(file, 'r') as infile:
            stripped_lines = [line.strip() for line in infile]
        # The trailing space lets the last word of the file satisfy the
        # "followed by a delimiter" part of the pattern.
        text = " ".join(stripped_lines) + " "
        return re.findall(r"(\w+'\w+|\w+-\w+|\w+)[\s,.!?;]", text)

    def get_block_words(self, gap):
        """Return the blocks of words on each side of gap number ``gap``.

        Args:
            gap: zero-based gap number, i.e. an index into ``self.cohesion``
                (not a word index).

        Returns:
            (left_block, right_block): two list slices of ``self.words``.
        """
        gap_index = self.first_gap_index + self.words_per_sequence * gap
        left_block = self.words[gap_index - self.blockwordsize:gap_index]
        right_block = self.words[gap_index:gap_index + self.blockwordsize]
        return left_block, right_block
#----------------------------------------------------------------------
# main program:
if __name__ == "__main__":
    # Script entry point.  Without exactly one command-line argument, tile a
    # hard-coded sample file; with one argument, treat it as a directory.
    if len(sys.argv) != 2:
        dir = "c:\\Balzac\\tmp\\"
        filename = "c:\\Balzac\\balzacbooksnofront\\adieu10.txt"
        words_per_sequence = 20
        sequences_per_block = 2
        # Pass the path held in `filename`, not the builtin `file`.
        tiling = TextTile(filename, sequences_per_block, words_per_sequence)
    else:
        dir = sys.argv[1]
        # NOTE(review): wordsperfile_indirectory is not defined in the visible
        # part of this file -- confirm where it is supposed to come from.
        wordcounts = wordsperfile_indirectory(dir)
Text file Source (historic): geocities.com/soho/square/3472
geocities.com/soho/squaregeocities.com/soho
(to report bad content: archivehelp @ gmail)
|
|
|
|
|