# filter2 - creates an HTML table of contents (TOC).
#
# Strips the chapter names and numbers out of a Project Gutenberg text file
# and writes out HTML link records for them.
#
# This program is used in conjunction with chaptofile.py to reduce a Project
# Gutenberg text file (specifically a Balzac novel) to a set of HTML pages
# connected with HTML frames.  Prints to standard output.
#
# Usage: filter2 <gutenberg-text-file>
#
# Problems / TODO:
#   1. Multiple-line chapter titles are cut; only the first line is taken.
#   2. There is a problem if the work is divided into parts and has two
#      identical series of chapters named the same, e.g. I, II, III, IV, V, ...
#      The short-term fix is to manually edit the HTML produced and change
#      the chapter titles, and when running chaptofile to first split the
#      original novel/text file into two parts, process them separately and
#      then combine them.
#      The long-term solution is to read the HTML files produced by
#      chaptofile and use the HTML-tagged contents of these files to produce
#      the table of contents.
#
# The code below avoids Python-2-only constructs (print statements, functions
# of the `string` module), so it runs unchanged on Python 2.6+ and Python 3.

import re
import string  # no longer used here; kept because the original file imported it
import sys

# NOTE(review): the markup inside this script's string literals was missing
# from the copy that was reviewed (every header/footer literal was empty and
# the link fragments were garbled).  The values below are reconstructions
# chosen to fit what remained: six tag-only header lines, two tag-only footer
# lines.  Confirm them against the original script before relying on the
# exact output.
_HTML_HEADER_LINES = (
    '<html>',
    '<head>',
    '<title>',
    '</title>',
    '</head>',
    '<body>',
)
_HTML_FOOTER_LINES = (
    '</body>',
    '</html>',
)

# A line that starts (after optional whitespace) with "chapter <word>".
# Group 1 is the whole "chapter <word>" text, group 2 is the <word> alone.
_CHAPTER_LINE_RE = re.compile(r'\s*(chapter\s+(\w+))\s*', re.IGNORECASE)

# The literal word "chapter" at the start of a string.
_CHAPTER_WORD_RE = re.compile(r'chapter', re.IGNORECASE)

# A chapter number: decimal digits, or a run of Roman-numeral letters.  The
# trailing `$` makes the WHOLE token count; without it `match` only checks a
# prefix, so ordinary words such as "in" or "Victor" were accepted as
# numerals.  L, D and M are included so that numbers like XL still validate
# now that the full token is checked.
_NUMERAL_RE = re.compile(r'(?:\d+|[IVXLCDM]+)$', re.IGNORECASE)


def _warn(message):
    """Write a diagnostic to stderr so it cannot end up in the HTML output."""
    sys.stderr.write(message + '\n')


def print_html_header():
    """Print the opening HTML lines of the table-of-contents page."""
    for html_line in _HTML_HEADER_LINES:
        print(html_line)


def print_html_footer():
    """Print the closing HTML lines of the table-of-contents page."""
    for html_line in _HTML_FOOTER_LINES:
        print(html_line)


def make_chapter_link(bookname, chapter_number, chapter_name):
    """Return one HTML link record for a chapter.

    The record is built as
    s1 + bookname + s2 + chapter_number + s3 + chapter_number + s4
    + chapter_name + s5, i.e. a link whose target is derived from the book
    name and chapter number and whose text is "<number> <name>".

    bookname       -- book name used as the start of the link target
    chapter_number -- chapter number exactly as it appears in the text
    chapter_name   -- chapter title (first title line only)
    """
    # NOTE(review): s1, s2, s3 and s5 are reconstructed (see the note on the
    # HTML constants above); only s4 (a single space) survived intact.  In
    # particular the separator s2, the ".html" suffix and the absence of a
    # frame `target` attribute are assumptions -- they must agree with the
    # page file names that chaptofile.py produces.
    s1 = '<a href="'
    s2 = '_'
    s3 = '.html">'
    s4 = ' '
    s5 = '</a><br>'
    link = (s1 + bookname + s2 + chapter_number + s3 +
            chapter_number + s4 + chapter_name + s5)
    return link


def bookname(filename):
    """Return the part of *filename* before its first '.'.

    The split is on the first dot anywhere in the string, so a path such as
    "./book.txt" yields "".  This is left as it was so that the names
    generated here do not drift from whatever consumes them.
    """
    return filename.split(".")[0]


def chapter_heading_valid(match_object):
    """Return 1 if *match_object* describes a real chapter heading, else 0.

    *match_object* is expected to expose two groups: the "chapter <word>"
    text and the <word> itself.  The heading is valid when the first group
    starts with "chapter" and the second group is, in its entirety, either
    an integer or a run of Roman-numeral letters (any case).

    A short reason is written to stderr whenever 0 is returned.
    """
    matched = match_object.groups()
    if len(matched) != 2:
        _warn("matched missing two components")
        return 0

    # first element: "chapter"
    if not _CHAPTER_WORD_RE.match(matched[0]):
        _warn("not a chapter")
        return 0

    # second element: integer or Roman numeral, matched as a whole token
    if not _NUMERAL_RE.match(matched[1]):
        _warn("not a numeral")
        return 0

    return 1


def _chapter_links(lines, book_name):
    """Yield one link record per chapter found in *lines*.

    A chapter is a valid heading line followed, possibly after blank lines,
    by a non-empty line that is taken as the chapter title.  A heading that
    is never followed by a non-empty line produces no record.
    """
    pending_number = None  # number of a heading still waiting for its title
    for raw_line in lines:
        line = raw_line.strip()

        if pending_number is not None and line:
            yield make_chapter_link(book_name, pending_number, line)
            pending_number = None

        # Deliberately not an `else`: a title line is itself examined, so a
        # heading that directly follows an untitled heading is still seen.
        heading = _CHAPTER_LINE_RE.match(line)
        if heading and chapter_heading_valid(heading):
            pending_number = heading.group(2)


def print_chapter_headings(filename):
    """Print an HTML link record for every chapter found in *filename*.

    filename -- path of the text file to scan; its name up to the first '.'
                is passed to make_chapter_link as the book name.
    """
    book_name = bookname(filename)
    # `with` closes the file even if printing fails part-way through.
    with open(filename, 'r') as text_file:
        for link in _chapter_links(text_file, book_name):
            print(link)


def _main(argv):
    """Write the complete table of contents for argv[1] to standard output."""
    if len(argv) < 2:
        _warn("usage: %s <gutenberg-text-file>" % argv[0])
        return 2
    filename = argv[1]
    #filename = 'stlif10.txt'

    print('')
    print_html_header()
    print_chapter_headings(filename)
    print('')
    print_html_footer()
    return 0


#----------------------------------------------------------------------
# main program:
if __name__ == '__main__':
    sys.exit(_main(sys.argv))