# chaptofile3 -
# splits a Project Gutenberg prose text file into separate *HTML* files
# for each chapter (combines tohtml3.py and chaptofile.py)
# 1. adds an id to each paragraph tag

# 2. also added a style tag making the font of the main book text a
#    little smaller to the head of the html page
# problem/todo:
#  problem if the work is divided into parts and has identical
#  series of chapters named the same. e.g. I,II,III,IV,V,...
#  the easiest way to get around this is to split the original
#  novel/text file into the separate parts of which it is composed,
#  give them separate file names and process them separately.

#!/usr/bin/env python

import re
import string  # no longer used below; kept so existing imports are not removed
import sys

#--------------------------------------------------------------
# html:

# Spaces/tabs sitting directly in front of a newline.
_TRAILING_BLANKS_RE = re.compile(r'[ \t]+\n')

# NOTE(review): the HTML tag text inside the original string literals was
# lost (only the '\n' endings survived).  The tags below are reconstructed
# from the header comment ("adds id to paragraph tag", "style tag ... to the
# head of the html page") -- confirm against the original script, in
# particular the <title> and the exact font size.
_HTML_HEADER_LINES = (
    '<html>\n',
    '<head>\n',
    '<title></title>\n',
    '<style type="text/css">p { font-size: 90%; }</style>\n',  # ADDED
    '</head>\n',
    '<body>\n',
)
_HTML_FOOTER_LINES = (
    '</body>\n',
    '</html>\n',
)


def _file_prefix(filename):
    """Return the part of filename before its first '.'.

    NOTE: this mirrors the original behaviour, so a name such as
    './book.txt' or 'my.book.txt' is cut at the *first* dot.
    """
    return filename.split(".")[0]


def normalize_paragraph_breaks(line):
    """Return line with any spaces/tabs directly before a newline removed."""
    return _TRAILING_BLANKS_RE.sub('\n', line)


def print_html_header(outfile):
    """Write the opening HTML (including the added style tag) to outfile."""
    outfile.writelines(_HTML_HEADER_LINES)


def print_html_footer(outfile):
    """Write the closing HTML tags to outfile."""
    outfile.writelines(_HTML_FOOTER_LINES)


def print_tagged_para(lines, filename, para_counter, outfile):
    """Write one paragraph, tagged with an anchor and an id, to outfile.

    lines        -- the text lines making up the paragraph; nothing is
                    written when this is empty
    filename     -- name of the HTML file being written; the part before the
                    first '.' becomes the prefix of the anchor/id
    para_counter -- integer used as the suffix of the anchor/id
    outfile      -- writable file object
    """
    if not lines:
        return
    para = ' '.join(lines)
    fileprefix = _file_prefix(filename)
    # NOTE(review): reconstructed tags (see note at _HTML_HEADER_LINES).  The
    # original formatted both tags with the same (prefix, counter) pair.
    name_tag = '<a name="%s_%d"></a>' % (fileprefix, para_counter)
    para_tag = '<p id="%s_%d">\n' % (fileprefix, para_counter)  # ADDED
    outfile.write('\n\n')
    outfile.write(name_tag)
    outfile.write(para_tag)  # ADDED
    outfile.write(para)
    outfile.write('</p>')


def filelines_to_html(lines_in_file, filename):
    """Write lines_in_file to the HTML file filename, one <p> per paragraph.

    A paragraph ends at a line that is empty after stripping leading
    whitespace.  The paragraph counter advances on every such line (also on
    consecutive blank lines), exactly as before, so existing ids stay stable.
    """
    para_lines = []
    para_counter = 0
    with open(filename, 'w') as outfile:
        print_html_header(outfile)
        for line in lines_in_file:
            # The original called normalize_paragraph_breaks() but threw the
            # result away; use it.
            line = normalize_paragraph_breaks(line).lstrip()
            if line:
                para_lines.append(line + ' ')
            else:
                para_counter += 1
                print_tagged_para(para_lines, filename, para_counter, outfile)
                para_lines = []
        # Flush a final paragraph that is not followed by a blank line;
        # previously it was silently dropped.
        if para_lines:
            para_counter += 1
            print_tagged_para(para_lines, filename, para_counter, outfile)
        print_html_footer(outfile)

#-------------------------------------------------------------
# chapter splitting:

# A line that starts with "chapter <word>"; group 2 is the word.
_CHAPTER_HEADING_RE = re.compile(r'\s*(chapter\s+(\w+))\s*', re.IGNORECASE)
_CHAPTER_WORD_RE = re.compile('chapter', re.IGNORECASE)
# The whole word must be an integer or consist of Roman numeral letters.
# Anchoring with '$' stops ordinary words that merely start with i/v/x/c
# ("chapter in which ...") from being accepted as chapter numbers.
_CHAPTER_NUMERAL_RE = re.compile(r'(?:\d+|[IVXLCDM]+)$', re.IGNORECASE)


def chapter_heading_valid(match_object):
    """Return True if match_object describes a real chapter heading.

    match_object must carry two groups: the text starting with "chapter" and
    the chapter number (arabic digits or Roman numeral letters).  A short
    diagnostic is printed for every rejected candidate.
    """
    matched = match_object.groups()
    if len(matched) != 2:
        print("matched missing two components")
        return False
    # first element: "chapter"
    if not _CHAPTER_WORD_RE.match(matched[0]):
        print("not a chapter")
        return False
    # second element: integer or Roman numeral
    if not _CHAPTER_NUMERAL_RE.match(matched[1]):
        print("not a numeral")
        return False
    return True


def write_chapter(lines, chapter_number, infilename):
    """Write lines unchanged to the text file <book_name>_<chapter_number>.txt."""
    outfilename = "%s_%s.txt" % (_file_prefix(infilename), chapter_number)
    with open(outfilename, 'w') as outfile:
        outfile.writelines(lines)


def make_chapter_filename(chapter_number, infilename):
    """Return the HTML file name <book_name>_<chapter_number>.html."""
    return "%s_%s.html" % (_file_prefix(infilename), chapter_number)


def tochapterfiles(filename):
    """Split the text file filename into one HTML file per chapter.

    Everything before the first chapter heading is written to the file whose
    chapter number is the empty string (e.g. "book_.html").  Chapters that
    share a number overwrite each other (see the todo at the top of the file).
    """
    prev_chapter_number = ""
    lines = []
    with open(filename, 'r') as infile:
        for line in infile:
            chapter_heading = _CHAPTER_HEADING_RE.match(line)
            if chapter_heading and chapter_heading_valid(chapter_heading):
                # A new chapter starts: write out what was collected so far.
                chapter_filename = make_chapter_filename(prev_chapter_number,
                                                         filename)
                filelines_to_html(lines, chapter_filename)
                prev_chapter_number = chapter_heading.group(2)
                lines = []
            lines.append(line)
    # Write the last (or only) chapter.
    chapter_filename = make_chapter_filename(prev_chapter_number, filename)
    filelines_to_html(lines, chapter_filename)

#----------------------------------------------------------------------
# main program:

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s textfile\n" % sys.argv[0])
        sys.exit(1)
    filename = sys.argv[1]
    tochapterfiles(filename)