# doesn't work yet - under construction
# tile2 -
# iterate over blocks building a histogram
# build a histogram of the histograms
# look at how the shape of the histograms changes as block size changes
# as block size changes how does the average frequency for an included word change?
import glob
import math
import os.path
import re
import string
import sys
class Histogram:
    """Histogram of word counts: occurrence count -> number of words.

    Built from a word-count mapping ``wc`` (word -> number of occurrences).
    ``self.hist`` maps each distinct occurrence count to how many words in
    ``wc`` have exactly that count.
    """

    def __init__(self, wc):
        """Build the histogram from ``wc`` (a dict of word -> count)."""
        hist = {}
        for count in wc.values():
            # Key on the count itself, not on the word.
            hist[count] = hist.get(count, 0) + 1
        self.hist = hist

    def _observation_count(self):
        """Return the total number of words represented in the histogram."""
        return sum(self.hist.values())

    def sample_average(self):
        """Return the mean occurrence count over all words, as a float.

        Each count is weighted by the number of words that have it.

        Raises:
            ValueError: if the histogram is empty.
        """
        n = self._observation_count()
        if n == 0:
            raise ValueError("average of 0 items is undefined")
        weighted_total = 0
        for value, frequency in self.hist.items():
            weighted_total += frequency * value
        # float() keeps true division on Python 2 as well as Python 3.
        return float(weighted_total) / n

    def sample_variance(self):
        """Return the spread of the occurrence counts, as a float.

        NOTE: despite the name, this is the square root of the unbiased
        (n - 1) sample variance, i.e. the sample standard deviation; the
        square root is kept from the original formula.

        Returns 0.0 when the histogram describes a single word, because the
        (n - 1) denominator would otherwise be zero.

        Raises:
            ValueError: if the histogram is empty.
        """
        n = self._observation_count()
        if n == 0:
            raise ValueError("variance of 0 items is undefined")
        if n < 2:
            return 0.0
        avg = self.sample_average()
        squared_total = 0.0
        for value, frequency in self.hist.items():
            squared_total += frequency * (value - avg) ** 2
        return math.sqrt(squared_total / (n - 1))
class TextTile:
    """Split the words of a text file into fixed-size blocks and summarise them.

    Attributes set by ``__init__``:
        file: path of the text file that was read.
        blocksize: number of words per block.
        words: list of words extracted from the file, in order.
        word_count: ``len(words)``.
        block_count: number of complete blocks (trailing words that do not
            fill a whole block are not counted).
    """

    def __init__(self, file, words_per_block):
        """Read ``file`` and prepare blocks of ``words_per_block`` words.

        Raises:
            ValueError: if ``words_per_block`` is not positive.
        """
        if words_per_block <= 0:
            raise ValueError("words_per_block must be positive")
        self.file = file
        self.blocksize = words_per_block
        self.words = self.extractwords(file)
        self.word_count = len(self.words)
        # Floor division: a block count must be a whole number.
        self.block_count = self.word_count // words_per_block

    def extractwords(self, file):
        """Return the list of words found in the text file at path ``file``.

        Lines are stripped and joined with single spaces. A word is a run of
        word characters, optionally containing one apostrophe or one hyphen,
        that is immediately followed by whitespace or one of ``, . ! ? ;``.
        """
        with open(file, 'r') as infile:
            # The trailing space after every line matters: the pattern below
            # requires a delimiter after each word, including the last one.
            text = "".join(line.strip() + " " for line in infile)
        return re.findall(r"(\w+'\w+|\w+-\w+|\w+)[\s,.!?;]", text)

    def get_average_frequency_variance(self, words):
        """Return the mean, over all complete blocks, of the per-block spread.

        For each block a word count is built (word -> occurrences in the
        block), turned into a ``Histogram`` (occurrences -> number of words),
        and that histogram's ``sample_variance()`` is collected. The result
        is the average of the collected values.

        Args:
            words: unused; the blocks are always taken from ``self.words``.
                Kept so existing call sites keep working.

        Raises:
            ValueError: if there is no complete block to average over.
        """
        spreads = []
        for i in range(self.block_count):
            begin = i * self.blocksize
            # Slice ends are exclusive, so this covers exactly blocksize words.
            end = begin + self.blocksize
            wc = self.count_words(self.words[begin:end])
            spreads.append(Histogram(wc).sample_variance())
        return self.average_list(spreads)

    def count_words(self, list):
        """Return a dict mapping each word in ``list`` to its occurrence count."""
        # NOTE: the parameter name shadows the builtin ``list``; it is kept
        # because it is part of the method's signature.
        wc = {}
        for word in list:
            wc[word] = wc.get(word, 0) + 1
        return wc

    def average_list(self, list):
        """Return the arithmetic mean of the numbers in ``list`` as a float.

        Raises:
            ValueError: if ``list`` is empty.
        """
        count = len(list)
        if count == 0:
            raise ValueError("average of 0 items is undefined")
        # float() keeps true division on Python 2 as well as Python 3.
        return float(sum(list)) / count

    def variance_list(self, list):
        """Return the spread of the numbers in ``list`` as a float.

        NOTE: despite the name, this is the square root of the unbiased
        (n - 1) sample variance, i.e. the sample standard deviation; the
        square root is kept from the original formula.

        Returns 0.0 for a single item, because the (n - 1) denominator would
        otherwise be zero.

        Raises:
            ValueError: if ``list`` is empty.
        """
        count = len(list)
        if count == 0:
            raise ValueError("variance of 0 items is undefined")
        if count < 2:
            return 0.0
        avg = self.average_list(list)
        squared_total = 0.0
        for item in list:
            squared_total += (item - avg) ** 2
        return math.sqrt(squared_total / (count - 1))

    def print_words(self, filename):
        """Write every word to ``filename``, one ``index:word`` pair per line."""
        with open(filename, 'w') as outfile:
            for i, word in enumerate(self.words):
                outfile.write("%s:%s\n" % (i, word))
# Driver: read the hard-coded input text, tile it into blocks of
# words_per_block words, and dump every extracted word (with its index)
# to junk3.txt.
words_per_block = 200
filename = "c:\\Balzac\\balzacbooksnofront\\gbsek10.txt"
# TextTile takes (file, words_per_block); the former extra
# sequences_per_block argument was never defined anywhere in this file.
tile1 = TextTile(filename, words_per_block)
tile1.print_words("junk3.txt")
# Text file source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square    geocities.com/soho
# (to report bad content: archivehelp @ gmail)