# matchline -
# Extract all words in the file with their location in the file so you can trace
# your way back to where the words came from
# Word location consists of a line number, a start character offset within the
# line, and an end character offset (exclusive, counting the delimiter
# character matched after the word).
#---------------------------------------------------------------------------
import re
def append_list(l1, l2):
    """Append every item of ``l2`` to the end of ``l1``, in place.

    Args:
        l1: The list that is extended (mutated).
        l2: Any iterable of items to add.

    Returns:
        None; the result is the mutation of ``l1``.
    """
    # list.extend snapshots its argument when it is a list, so passing the
    # same list for both arguments doubles it instead of looping forever the
    # way a hand-written "for item in l2: l1.append(item)" does.
    l1.extend(l2)
# Compiled once at import time.  A "word" is a run of word characters that may
# contain one internal apostrophe or hyphen (don't, well-known).  It must be
# followed by a single delimiter (whitespace or one of , . ! ? ;) or by the
# end of the line; that delimiter, when present, is part of the overall match.
_WORD_RE = re.compile(r"(\w+'\w+|\w+-\w+|\w+)(?:[\s,.!?;]|$)")


def find_wordsinline(line, linectr):
    """Find every word in ``line`` and record where it was found.

    Args:
        line: The text of one line.
        linectr: The line number to store in each record; it is copied into
            the result unchanged.

    Returns:
        A list of ``(word, linectr, start, end)`` tuples in left-to-right
        order.  ``start`` is the offset of the word's first character and
        ``end`` is the exclusive end offset of the whole match, i.e. it also
        covers the single delimiter character after the word when there is
        one.  An empty list is returned when nothing matches.

    Note:
        A word followed by any other character (for example ``:``, ``"`` or
        ``)``) is not reported, because only the delimiters listed in the
        pattern, or the end of the line, are accepted after a word.
    """
    # finditer yields successive non-overlapping matches, which is the same
    # scan as repeatedly calling search() from the end of the previous match.
    return [
        (match.group(1), linectr, match.start(), match.end())
        for match in _WORD_RE.finditer(line)
    ]
def extractwordsline(file):
    """Collect every word in a text file together with its location.

    Args:
        file: Path of the text file to read.

    Returns:
        One flat list holding, in file order, the records that
        ``find_wordsinline`` produces for each line.  Lines are numbered
        from 0.
    """
    words = []
    # The with-block closes the file even if reading or matching raises.
    with open(file, 'r') as infile:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of lines.
        for linectr, line in enumerate(infile):
            words.extend(find_wordsinline(line, linectr))
    return words
# Module-level driver: runs when the file is executed or imported, extracting
# the words of the hard-coded Windows path below into the global `words`.
words = extractwordsline("c:\\Balzac\\Balzacbooksnofront\\gbsek10.txt")
# Text file source (historic): geocities.com/soho/square/3472
# geocities.com/soho/square, geocities.com/soho
# (to report bad content: archivehelp @ gmail)
|
|
|
|
|