# matchline4 - finds density of topical keywords in the text
# extract all the words from the file, chunk them into contiguous
# blocks of 100 words, and count the occurence of words from certain keyword groups
# in each block, then scale the keyword count for each block from 0 to 100,
# the block scaled to 100 being the block with the greatest count
# Call from DOS prompt:
# C:\Python20>python matchline4.txt >matchline4out.txt
#---------------------------------------------------------------------------
# Extract all words in the file with their location in the file so you can trace
# your way back to where the words came from
# Word location consists of a line number, a character offset within the line,
# and a character length.
#---------------------------------------------------------------------------
import re, string
def append_list(l1, l2):
    # Append every item of l2 onto l1 in place (l1 is mutated; nothing is
    # returned).  The original hand-rolled loop is exactly list.extend.
    l1.extend(l2)
def find_wordsinline(line, linectr):
    # Find every word in `line`, where a "word" is a run of word characters,
    # optionally containing one internal apostrophe or hyphen, followed by a
    # delimiter (whitespace or , . ! ? ;).  Returns a list of tuples
    # (word, linectr, match_start, match_end); the span covers the word plus
    # its trailing delimiter so callers can trace back into the source line.
    #
    # BUGFIX: the pattern is now a raw string — the original non-raw "\-"
    # is an invalid string escape (a warning on modern Python).  The regex
    # itself is unchanged.  re.finditer resumes scanning at the end of each
    # match, which reproduces the original manual search(pos)/span loop.
    re_word = re.compile(r"(\w+'\w+|\w+-\w+|\w+)[\s,.!?;]", re.IGNORECASE)
    words = []
    for match in re_word.finditer(line):
        words.append((match.group(1), linectr, match.start(), match.end()))
    return words
def extractwordsline(file):
    # Read `file` (a path string) and return a flat list of word records
    # (word, line_number, start_offset, end_offset) for every word in the
    # file, so each word can be traced back to where it came from.
    #
    # FIX: the original opened the file and only closed it on the success
    # path — a `with` block guarantees the handle is released even if an
    # exception fires mid-read.  Iterating the file object directly also
    # avoids materializing the whole file via readlines(), and enumerate
    # replaces the manual line counter.
    # (The parameter name `file` shadows a builtin but is kept for
    # backward compatibility with keyword callers.)
    words = []
    with open(file, 'r') as infile:
        for linectr, line in enumerate(infile):
            append_list(words, find_wordsinline(line, linectr))
    return words
#----------------------------------------------------------------------
def load_wordlist(filename):
    # Load a keyword file (one word per line) into a dict mapping each
    # stripped word to 1, used as a fast membership set by the block
    # counter.  A blank line yields the key "" — harmless, since the word
    # regex can never produce an empty word; preserved for compatibility.
    #
    # FIX: string.strip(line) relied on module-level functions removed from
    # the `string` module in Python 3; line.strip() works everywhere.  The
    # `with` block makes the file handle exception-safe.
    words = {}
    with open(filename, 'r') as infile:
        for line in infile:
            words[line.strip()] = 1
    return words
def get_block_keywordcounts():
blocksize = 100
words = extractwordsline("c:\\Balzac\\Balzacbooksnofront\\gbsek10.txt")
character_words = load_wordlist("c:\\python20\\characterwords.txt")
interior_words = load_wordlist("c:\\python20\\interiorwords.txt")
food_words = load_wordlist("c:\\python20\\foodwords.txt")
emotion_words = load_wordlist("c:\\python20\\emotionwords.txt")
blockcount = len(words) / blocksize
print "blockcount: ", blockcount
character = food = interior = emotion = 0
for block in range(0,blockcount-1):
character_block = food_block = interior_block = emotion_block = 0
begin = block * blocksize
end = begin + blocksize - 1
for i in range(begin,end):
word = words[i][0]
if character_words.has_key(word):
character_block = character_block + 1
if interior_words.has_key(word):
interior_block = interior_block + 1
if food_words.has_key(word):
food_block = food_block + 1
if emotion_words.has_key(word):
emotion_block = emotion_block + 1
character = character + character_block
interior = interior + interior_block
food = food + food_block
emotion = emotion + emotion_block
print "block:(character:%s,interior:%s,food:%s,emotion:%s)" % (character_block,interior_block,food_block,emotion_block)
print "(character:%s,interior:%s,food:%s,emotion:%s)" % (character,interior,food,emotion)
get_block_keywordcounts()
# Text file Source (historic): geocities.com/soho/square/3472
# geocities.com/soho/squaregeocities.com/soho
# (to report bad content: archivehelp @ gmail)