# filter2 - creates an HTML table of contents (toc):
# extracts the chapter names and numbers from a Project Gutenberg
# text file and writes out HTML link records for them.
# This program is used in conjunction with chaptofile.py to reduce a project
# gutenberg text file (specifically a Balzac novel) to a set of html pages
# connected with html frames. Prints to standard output.
# problems/todo:
# 1. multiple line chapter titles are cut, only the first line is taken
# 2. problem if the work is divided into parts and each part repeats
# the same series of chapter numbers, e.g. I,II,III,IV,V,...
# The short term fix is just to manually edit the html produced
# and change the chapter titles and when running chaptofile
# first split the original novel/text file into two parts,
# process them separately and then combine them.
# The long term solution is to read the html files produced
# by chaptofile and use the html tagged contents of these files
# to produce a table of contents (toc).
import sys, string, re
# Print the opening lines of the table-of-contents page to standard output.
# Takes no arguments; there is no return statement, so the result is None.
# NOTE(review): every string literal below is empty, and the second one is
# split across two lines (not a valid single-quoted literal as written).
# Presumably the HTML markup these strings contained was stripped from this
# copy of the file, along with the body indentation -- TODO restore both
# from the original source.
def print_html_header():
print ''
print '
'
print ''
print ''
print ''
print ''
# Print the closing lines of the table-of-contents page to standard output.
# Takes no arguments; there is no return statement, so the result is None.
# NOTE(review): both string literals are empty here; presumably they held
# the closing HTML tags and the markup was stripped from this copy of the
# file -- TODO restore from the original source.
def print_html_footer():
print ''
print ''
# Build the link record for one chapter and return it as a single string.
#   bookname       - book identifier, inserted right after s1 (this
#                    parameter shadows the module-level bookname() function
#                    inside this body)
#   chapter_number - chapter numeral; it is inserted twice
#   chapter_name   - chapter title, inserted after the second
#                    chapter_number and s4
# NOTE(review): the literals look damaged in this copy: s1 is not
# terminated, s5 is split across two lines, and s2 and s3 are used in the
# concatenation but never assigned. Presumably the HTML tag text was
# stripped -- TODO restore the literals from the original source before
# relying on this function.
def make_chapter_link(bookname,chapter_number,chapter_name):
s1 = '"
s4 = ' '
s5 = '
'
# the arguments are joined with nothing between them except s1..s5
link = s1 + bookname + s2 + chapter_number + s3 + chapter_number + s4 + chapter_name + s5
return link
def bookname(filename):
    """Return the book name: the part of ``filename`` before its first ".".

    "stlif10.txt" gives "stlif10"; a name without a dot is returned
    unchanged.  The cut is made at the first dot of the whole string, so a
    dot inside a directory part of a path also counts.
    """
    # str.split replaces string.split: the string-module function is
    # deprecated and no longer exists in Python 3.
    return filename.split(".", 1)[0]
def chapter_heading_valid(match_object):
    """Return True if a regex match is a genuine "Chapter <number>" heading.

    ``match_object`` must come from a pattern with exactly two groups: the
    heading text (e.g. "Chapter IV") and the bare chapter number ("IV").
    The heading has to start with the word "chapter" and the number has to
    consist entirely of digits or entirely of roman-numeral letters; case
    is ignored.  Returns False otherwise.

    The reason for a rejection is written to standard error, because
    standard output carries the generated HTML.
    """
    matched = match_object.groups()
    if len(matched) != 2:
        sys.stderr.write("matched missing two components\n")
        return False
    heading, numeral = matched
    # first element: "chapter"
    if not heading or not re.match(r'chapter', heading, re.IGNORECASE):
        sys.stderr.write("not a chapter\n")
        return False
    # second element: arabic or roman numeral.  The trailing $ makes the
    # whole word count, so words such as "In" or "Vice" no longer pass on
    # the strength of their first letter; L, D and M complete the roman
    # alphabet so that numbers such as XL are still accepted.
    if not numeral or not re.match(r'(?:\d+|[IVXLCDM]+)$', numeral,
                                   re.IGNORECASE):
        sys.stderr.write("not a numeral\n")
        return False
    return True
def print_chapter_headings(filename):
    """Print one link record for every chapter heading found in ``filename``.

    The file is read line by line.  A line that starts with "chapter"
    followed by a number accepted by chapter_heading_valid() marks a new
    chapter; the next non-blank line after it is taken as the chapter name
    and the link built by make_chapter_link() is printed to standard
    output.  A heading that is not followed by any non-blank line produces
    no link.

    Raises IOError/OSError if the file cannot be opened.
    """
    re_chapter = re.compile(r'\s*(chapter\s+(\w+))\s*', re.IGNORECASE)
    book = bookname(filename)
    chapter_number = ''
    find_name = False  # True while waiting for the chapter-name line
    source = open(filename, 'r')
    # try/finally guarantees the file is closed even if a helper raises.
    try:
        for line in source:
            line = line.strip()
            if find_name and line:
                print(make_chapter_link(book, chapter_number, line))
                chapter_number = ''
                find_name = False
            # The name line itself is also tested: a heading that directly
            # follows another heading starts the next chapter.
            chapter_heading = re_chapter.match(line)
            if chapter_heading and chapter_heading_valid(chapter_heading):
                chapter_number = chapter_heading.group(2)
                find_name = True
    finally:
        source.close()
#----------------------------------------------------------------------
# main program:
# write the table of contents for the text file named on the command
# line (e.g. stlif10.txt) to standard output
if len(sys.argv) < 2:
    # sys.exit with a string prints it to standard error and exits with
    # status 1, instead of failing with a bare IndexError traceback.
    sys.exit("usage: %s gutenberg-text-file" % sys.argv[0])
filename = sys.argv[1]
# The helpers print their own output and return None, so they are called
# as plain statements; printing their result would add a stray "None"
# line to the HTML.
print_html_header()
print_chapter_headings(filename)
print_html_footer()