# matchline4 - finds density of topical keywords in the text
# extract all the words from the file, chunk them into contiguous
# blocks of 100 words, and count the occurence of words from certain keyword groups
# in each block, then scale the keyword count for each block from 0 to 100,
# the block scaled to 100 being the block with the greatest count
# Call from DOS prompt:
# C:\Python20>python matchline4.txt >matchline4out.txt
#---------------------------------------------------------------------------
# Extract all words in the file with their location in the file so you can trace
# your way back to where the words came from
# Word location consists of a line number, a character offset within the line,
# and a character length.
#---------------------------------------------------------------------------
import re, string
def append_list(l1, l2):
    # Append every item of l2 onto l1 in place (l1 is mutated; nothing is
    # returned).  The original hand-rolled loop is exactly list.extend.
    l1.extend(l2)
def find_wordsinline(line, linectr):
    # Find every word in `line`, where a "word" is a run of word characters,
    # optionally containing one internal apostrophe or hyphen, followed by a
    # delimiter (whitespace or , . ! ? ;).  Returns a list of tuples
    # (word, linectr, match_start, match_end); the span covers the word plus
    # its trailing delimiter so callers can trace back into the source line.
    #
    # BUGFIX: the pattern is now a raw string — the original non-raw "\-"
    # is an invalid string escape (a warning on modern Python).  The regex
    # itself is unchanged.  re.finditer resumes scanning at the end of each
    # match, which reproduces the original manual search(pos)/span loop.
    re_word = re.compile(r"(\w+'\w+|\w+-\w+|\w+)[\s,.!?;]", re.IGNORECASE)
    words = []
    for match in re_word.finditer(line):
        words.append((match.group(1), linectr, match.start(), match.end()))
    return words
def extractwordsline(file):
    # Read `file` (a path string) and return a flat list of word records
    # (word, line_number, start_offset, end_offset) for every word in the
    # file, so each word can be traced back to where it came from.
    #
    # FIX: the original opened the file and only closed it on the success
    # path — a `with` block guarantees the handle is released even if an
    # exception fires mid-read.  Iterating the file object directly also
    # avoids materializing the whole file via readlines(), and enumerate
    # replaces the manual line counter.
    # (The parameter name `file` shadows a builtin but is kept for
    # backward compatibility with keyword callers.)
    words = []
    with open(file, 'r') as infile:
        for linectr, line in enumerate(infile):
            append_list(words, find_wordsinline(line, linectr))
    return words
#----------------------------------------------------------------------
def load_wordlist(filename):
    # Load a keyword file (one word per line) into a dict mapping each
    # stripped word to 1, used as a fast membership set by the block
    # counter.  A blank line yields the key "" — harmless, since the word
    # regex can never produce an empty word; preserved for compatibility.
    #
    # FIX: string.strip(line) relied on module-level functions removed from
    # the `string` module in Python 3; line.strip() works everywhere.  The
    # `with` block makes the file handle exception-safe.
    words = {}
    with open(filename, 'r') as infile:
        for line in infile:
            words[line.strip()] = 1
    return words
def get_block_keywordcounts():
blocksize = 100
words = extractwordsline("c:\\Balzac\\Balzacbooksnofront\\gbsek10.txt")
character_words = load_wordlist("c:\\python20\\characterwords.txt")
interior_words = load_wordlist("c:\\python20\\interiorwords.txt")
food_words = load_wordlist("c:\\python20\\foodwords.txt")
emotion_words = load_wordlist("c:\\python20\\emotionwords.txt")
blockcount = len(words) / blocksize
print "blockcount: ", blockcount
character = food = interior = emotion = 0
for block in range(0,blockcount-1):
character_block = food_block = interior_block = emotion_block = 0
begin = block * blocksize
end = begin + blocksize - 1
for i in range(begin,end):
word = words[i][0]
if character_words.has_key(word):
character_block = character_block + 1
if interior_words.has_key(word):
interior_block = interior_block + 1
if food_words.has_key(word):
food_block = food_block + 1
if emotion_words.has_key(word):
emotion_block = emotion_block + 1
character = character + character_block
interior = interior + interior_block
food = food + food_block
emotion = emotion + emotion_block
print "block:(character:%s,interior:%s,food:%s,emotion:%s)" % (character_block,interior_block,food_block,emotion_block)
print "(character:%s,interior:%s,food:%s,emotion:%s)" % (character,interior,food,emotion)
get_block_keywordcounts()
# Text file Source (historic): geocities.com/soho/square/3472
# geocities.com/soho/squaregeocities.com/soho
# (to report bad content: archivehelp @ gmail)