# doesn't work yet - under construction

# tile2 - 

# iterate over blocks building a histogram
# build a histogram of the histograms
# look at how the shape of the histograms changes as block size changes
# as block size changes how does the average frequency for an included word change?

import glob
import math
import os.path
import re
import string
import sys

class Histogram:
    """Frequency-of-frequencies histogram built from a word-count mapping.

    ``self.hist`` maps an occurrence count to the number of distinct words
    that occur exactly that many times in the counted block.
    """

    def __init__(self, wc):
        """Build the histogram.

        wc -- mapping of word -> number of occurrences of that word.
        """
        hist = {}
        # Bin by the *count*, not by the word: each word contributes one
        # observation to the bin for its occurrence count.
        for count in wc.values():
            hist[count] = hist.get(count, 0) + 1
        self.hist = hist

    def _observation_count(self):
        """Return the number of words tallied (sum of all bin sizes)."""
        return sum(self.hist.values())

    def sample_average(self):
        """Return the mean occurrence count per distinct word, as a float.

        Each bin's value is weighted by how many words fall in it, and the
        total is divided by the number of words (not the number of bins).

        Raises ValueError when the histogram is empty.
        """
        n = self._observation_count()
        if n == 0:
            raise ValueError("average of 0 items is undefined")
        total = sum(value * frequency
                    for value, frequency in self.hist.items())
        # float() keeps true division under Python 2 as well as Python 3.
        return float(total) / n

    def sample_variance(self):
        """Return sqrt(sum(f * (v - mean)**2) / (n - 1)) over the bins.

        NOTE: as in the original formula, the square root is taken, so the
        returned value is the sample standard deviation even though the
        method is named ``sample_variance``.

        Raises ValueError when fewer than two words were tallied, because
        the (n - 1) divisor would be zero or negative.
        """
        n = self._observation_count()
        if n < 2:
            raise ValueError("sample variance needs at least 2 items")
        avg = self.sample_average()
        squared_deviation = sum(frequency * (value - avg) ** 2
                                for value, frequency in self.hist.items())
        return math.sqrt(squared_deviation / (n - 1))


class TextTile:
    """The words of a text file, divided into fixed-size blocks.

    Attributes set by the constructor:
      file        -- path of the source text file
      blocksize   -- number of words per block
      words       -- list of all words extracted from the file
      word_count  -- len(words)
      block_count -- number of complete blocks (word_count // blocksize)
    """

    # A word is a run of word characters, optionally containing a single
    # apostrophe or hyphen, and must be followed by whitespace or one of
    # , . ! ? ;  (the trailing character is consumed but not captured).
    _WORD_PATTERN = re.compile(r"(\w+'\w+|\w+\-\w+|\w+)[\s,.!?;]")

    def __init__(self, file, words_per_block):
        """Read `file` and split its words into blocks of `words_per_block`.

        Raises ValueError if `words_per_block` is not positive.
        """
        if words_per_block <= 0:
            raise ValueError("words_per_block must be positive")
        self.file = file
        self.blocksize = words_per_block
        self.words = self.extractwords(file)
        self.word_count = len(self.words)
        # Floor division: block_count must be an int; a trailing partial
        # block is not counted.
        self.block_count = self.word_count // words_per_block

    def extractwords(self, file):
        """Return the list of words found in the text file at path `file`."""
        with open(file, 'r') as infile:
            # Each stripped line is followed by a space so that the last
            # word of every line still has the separator the pattern needs.
            text = "".join(line.strip() + " " for line in infile)
        return self._WORD_PATTERN.findall(text)

    def get_average_frequency_variance(self, words=None):
        """Average the per-block Histogram.sample_variance() over all blocks.

        words -- list of words to analyse; defaults to self.words.

        For each complete block of `self.blocksize` words:
          wc:   word -> frequency in block
          hist: frequency in block -> number of words with that frequency
        Words left over after the last complete block are ignored.

        Raises ValueError if there is no complete block, or if a block
        holds fewer than two distinct words.
        """
        if words is None:
            words = self.words
        size = self.blocksize
        variances = []
        # Step by whole blocks; the stop bound admits only complete blocks,
        # including the last one.
        for begin in range(0, len(words) - size + 1, size):
            # Slice ends are exclusive, so begin + size covers the full block.
            block_slice = words[begin:begin + size]
            hist = Histogram(self.count_words(block_slice))
            variances.append(hist.sample_variance())
        return self.average_list(variances)

    def count_words(self, list):
        """Return a dict mapping each word in `list` to its occurrence count."""
        wc = {}
        for word in list:
            wc[word] = wc.get(word, 0) + 1
        return wc

    def average_list(self, list):
        """Return the arithmetic mean of the numbers in `list` as a float.

        Raises ValueError for an empty list.
        """
        count = len(list)
        if count == 0:
            raise ValueError("average of 0 items is undefined")
        # float() keeps true division under Python 2 as well as Python 3.
        return float(sum(list)) / count

    def variance_list(self, list):
        """Return sqrt(sum((x - mean)**2) / (n - 1)) for the numbers in `list`.

        NOTE: as in the original formula, the square root is taken, so the
        returned value is the sample standard deviation.

        Raises ValueError when `list` has fewer than two items.
        """
        count = len(list)
        if count < 2:
            raise ValueError("sample variance needs at least 2 items")
        avg = self.average_list(list)
        squared_deviation = sum((item - avg) ** 2 for item in list)
        return math.sqrt(squared_deviation / (count - 1))

    def print_words(self, filename):
        """Write every word to `filename`, one "index:word" pair per line."""
        with open(filename, 'w') as outfile:
            for i, word in enumerate(self.words):
                outfile.write("%s:%s\n" % (i, word))


# Script driver: tile one text file and dump its indexed word list.
words_per_block = 200
filename = "c:\\Balzac\\balzacbooksnofront\\gbsek10.txt"

# Only touch the filesystem when run as a script, so that importing this
# module does not try to open the hard-coded input path.
if __name__ == "__main__":
    # TextTile takes (file, words_per_block).
    tile1 = TextTile(filename, words_per_block)
    tile1.print_words("junk3.txt")
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)