# doesn't work yet - under construction

# tile1 - 
# simple text tiling 

# Once we have extracted the words from the file, we don't need to calculate the
# list of token sequences, since they are given by a regular sequence of indices:
# e.g. if 4 token sequences form a block and 20 words form a token sequence, we
# can collect the words in a block with a simple list slice.
#=================================================================
import glob
import math
import os.path
import re
import string
import sys

class TextTile:
    """Score the lexical cohesion at every token-sequence gap of a text file.

    The text is read as a flat list of words.  A token sequence is
    ``words_per_sequence`` consecutive words and a block is
    ``sequences_per_block`` consecutive token sequences.  A gap is a
    boundary between two token sequences; only gaps that have a complete
    block on both sides are scored.  The score of a gap is the cosine of
    the term-frequency vectors of the block to its left and the block to
    its right.
    """

    # Tokenizer: a word (optionally containing one apostrophe or one hyphen)
    # that is immediately followed by whitespace or one of , . ! ? ;
    # NOTE(review): a word directly followed by any other character (for
    # example a colon or a quote mark) is not matched and is silently
    # skipped -- confirm that this is the intended tokenization.
    _WORD_PATTERN = re.compile(r"(\w+\'\w+|\w+\-\w+|\w+)[\s,.!?;]")

    def __init__(self, file, sequences_per_block, words_per_sequence):
        """Read ``file`` and compute the cohesion score of every scorable gap.

        file                -- path of the text file to tile
        sequences_per_block -- number of token sequences in one block (>= 1)
        words_per_sequence  -- number of words in one token sequence (>= 1)

        Raises ValueError if either size is smaller than 1.
        """
        if sequences_per_block < 1 or words_per_sequence < 1:
            raise ValueError(
                "sequences_per_block and words_per_sequence must both be >= 1")

        self.file = file
        self.sequences_per_block = sequences_per_block
        self.words_per_sequence = words_per_sequence
        self.words = self.extractwords(file)
        self.blockwordsize = sequences_per_block * words_per_sequence
        self.word_count = len(self.words)

        # Word index of the first gap with a complete block on its left.
        # (Starting one word earlier would make the left slice begin at -1
        # and therefore come out empty.)
        self.first_gap_index = self.blockwordsize
        # Largest word index that still leaves a complete block on the right.
        last_gap_limit = self.word_count - self.blockwordsize

        # Gaps are spaced one token sequence apart, so count them in units
        # of words_per_sequence rather than in words.
        if last_gap_limit < self.first_gap_index:
            self.gap_count = 0  # text too short for even one scorable gap
        else:
            span = last_gap_limit - self.first_gap_index
            self.gap_count = span // words_per_sequence + 1

        # cohesion[g] is the score of scorable gap number g (0-based).
        self.cohesion = [0] * self.gap_count
        for gap in range(self.gap_count):
            self.get_block_words(gap)

    def get_cohesion(self, left_block, right_block):
        """Return the cosine similarity of two blocks (lists of words)."""
        left_termvector, right_termvector = self.get_termvectors(
            left_block, right_block)
        return self.cosine(left_termvector, right_termvector)

    def cosine(self, v1, v2):
        """Return the cosine of the angle between numeric vectors v1 and v2.

        Prints an error message and returns 0.0 when the vectors differ in
        length or when either of them has zero magnitude (this includes
        two empty vectors).
        """
        if len(v1) != len(v2):
            print("ERROR: length of vectors is different, cannot calculate cosine")
            print("v1:  %s" % (v1,))
            print("v2:  %s" % (v2,))
            return 0.0

        # Accumulate over all components; the sums must not be overwritten
        # on each iteration.
        v1_sumofsquares = 0
        v2_sumofsquares = 0
        crossproduct = 0
        for a, b in zip(v1, v2):
            v1_sumofsquares += a * a
            v2_sumofsquares += b * b
            crossproduct += a * b

        denominator = math.sqrt(v1_sumofsquares) * math.sqrt(v2_sumofsquares)
        if denominator == 0:
            print("ERROR: denominator of cosine is zero\nv1:  %s \nv2:  %s"
                  % (v1, v2))
            return 0.0
        return crossproduct / denominator

    def get_termvectors(self, left_block, right_block):
        """Return (left_vector, right_vector) of raw term frequencies.

        The vocabulary is the sorted union of the distinct words of both
        blocks; position i of each vector is the number of times the i-th
        vocabulary word occurs in that block.  No further weighting or
        transform is applied to the counts.
        """
        vocabulary = sorted(set(left_block) | set(right_block))
        position = {}
        for index, word in enumerate(vocabulary):
            position[word] = index

        left_termvector = [0] * len(vocabulary)
        right_termvector = [0] * len(vocabulary)
        for word in left_block:
            left_termvector[position[word]] += 1
        for word in right_block:
            right_termvector[position[word]] += 1
        return left_termvector, right_termvector

    def extractwords(self, file):
        """Return the list of words found in the text file at path ``file``.

        Lines are stripped of surrounding whitespace and joined with single
        spaces (plus one trailing space so the final word has a delimiter)
        before the tokenizer pattern is applied.
        """
        # 'with' guarantees the file is closed even if reading fails.
        with open(file, 'r') as infile:
            text = " ".join([line.strip() for line in infile]) + " "
        return self._WORD_PATTERN.findall(text)

    def get_block_words(self, gap):
        """Score scorable gap number ``gap`` and return its two blocks.

        ``gap`` is a 0-based gap number (not a word index).  The cohesion of
        the blocks on either side of the gap is stored in
        ``self.cohesion[gap]`` and the pair (left_block, right_block) is
        returned.

        Raises IndexError if ``gap`` is not in range(self.gap_count).
        """
        if gap < 0 or gap >= self.gap_count:
            raise IndexError("gap number out of range: %r" % (gap,))
        gap_index = self.first_gap_index + self.words_per_sequence * gap
        left_block = self.words[gap_index - self.blockwordsize:gap_index]
        right_block = self.words[gap_index:gap_index + self.blockwordsize]
        self.cohesion[gap] = self.get_cohesion(left_block, right_block)
        return left_block, right_block

#----------------------------------------------------------------------
# main program:

if __name__ == "__main__":
    # Usage: <script> directory
    if len(sys.argv) != 2:
        # No single directory argument given: tile the hard-coded sample
        # text with 20-word token sequences and 2-sequence blocks.
        dir = "c:\\Balzac\\tmp\\"
        filename = "c:\\Balzac\\balzacbooksnofront\\adieu10.txt"
        words_per_sequence = 20
        sequences_per_block = 2
        tiling = TextTile(filename, sequences_per_block, words_per_sequence)
    else:
        dir = sys.argv[1]
        # NOTE(review): wordsperfile_indirectory is not defined in the
        # visible part of this file -- it must be supplied before this
        # branch can run.
        wordcounts = wordsperfile_indirectory(dir)
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)