# matchline - 
# Extract all words in the file with their location in the file so you can trace 
# your way back to where the words came from

# Word location consists of a zero-based line number plus the start and end
# character offsets within that line.
#--------------------------------------------------------------------------- 

import re

def append_list(l1,l2):
    """Append every item of l2 to the end of l1, in place.

    l1 -- list to grow; it is mutated
    l2 -- iterable supplying the items to add

    Returns None.  Unlike an item-by-item append loop, this also terminates
    when l1 and l2 are the same list (the list is simply doubled).
    """
    l1.extend(l2)

# A word is a run of word characters, optionally joined once by an apostrophe
# ("don't") or a hyphen ("well-known").  It only counts when it is followed by
# whitespace, one of , . ! ? ; or the end of the string.  The lookahead checks
# that without making the delimiter part of the match, so the reported span
# covers the word alone.  A word followed by any other character (for example
# ':' or a quote) is not reported.
_WORD_RE = re.compile(r"(\w+'\w+|\w+-\w+|\w+)(?=[\s,.!?;]|\Z)")


def find_wordsinline(line,linectr):
    """Find every word in one line of text together with its location.

    line    -- the text to scan
    linectr -- line number to record in each result; stored exactly as given

    Returns a list of (word, linectr, start, end) tuples in left-to-right
    order, where start and end are offsets into line such that
    line[start:end] == word.  An empty line yields an empty list.
    """
    words = []
    for match in _WORD_RE.finditer(line):
        # span(1) is the span of the captured word itself.
        start, end = match.span(1)
        words.append((match.group(1), linectr, start, end))
    return words

def extractwordsline(file):
    """Collect every word in a text file together with its location.

    file -- path of the text file to read

    Returns one list holding the tuples produced by find_wordsinline for
    each line, in file order.  Line numbers start at 0.  Each line's result
    is also printed as it is processed.
    """
    words = []
    # The with-block closes the file even if reading or matching raises.
    with open(file,'r') as infile:
        # Iterate the file object directly rather than loading every line
        # into memory first; enumerate supplies the zero-based line number.
        for linectr, line in enumerate(infile):
            newwords = find_wordsinline(line,linectr)
            # Per-line trace output.  Printing one pre-formatted string emits
            # the same text under the print statement and the print function.
            print("newwords:  %s" % (newwords,))
            words.extend(newwords)
    return words

# Runs at module level: extracts the word list from a hard-coded,
# Windows-style path and keeps the result in the module-level name `words`.
words = extractwordsline("c:\\Balzac\\Balzacbooksnofront\\gbsek10.txt")
# Text file Source (historic): geocities.com/soho/square/3472
#
# geocities.com/soho/square
# geocities.com/soho
#
# (to report bad content: archivehelp @ gmail)