# makewordlists -
# calls a C program to create a list of every word in the file, with the line
# number that it occurs on. Python with regexes was too slow.
# c:\\python20\wordlist.exe c:\\balzac\\adieu10.txt c:\\balzac\\todir\\
#=================================================================
import re, sys, string, glob, os.path, os
def make_wordlists(text_dir, wordlist_dir, program="c:\\python20\\wordlist.exe"):
    """Run the external word-list program over every file in a directory.

    For each regular file in ``text_dir`` whose name matches the glob
    pattern ``*.*``, ``program`` is run with that file's path as its only
    argument, and whatever it writes to standard output is saved in a file
    of the same name inside ``wordlist_dir``.  The equivalent shell command
    is printed before each run as a progress indicator.

    Args:
        text_dir: Directory holding the input text files.  A trailing path
            separator is accepted but not required.
        wordlist_dir: Directory that receives the output files.  It is
            created if it does not exist yet.
        program: Path of the executable to run on each file.

    Returns:
        None.

    Raises:
        OSError: If ``program`` cannot be started, or an output file or the
            output directory cannot be created.
    """
    # Imported here so the block does not depend on the file-level imports.
    import subprocess

    # An empty wordlist_dir means "current directory"; nothing to create.
    if wordlist_dir and not os.path.isdir(wordlist_dir):
        os.makedirs(wordlist_dir)

    # sorted() only makes the processing order deterministic.
    for text in sorted(glob.glob(os.path.join(text_dir, '*.*'))):
        # '*.*' also matches directories whose name contains a dot.
        if not os.path.isfile(text):
            continue

        wordlist = os.path.join(wordlist_dir, os.path.basename(text))

        # Opening the output truncates it, so writing onto the input file
        # would destroy the text before the program could read it.
        same_file = (os.path.normcase(os.path.abspath(wordlist)) ==
                     os.path.normcase(os.path.abspath(text)))
        if same_file:
            sys.stderr.write("skipping %s: output would overwrite input\n"
                             % text)
            continue

        print("%s %s >%s" % (program, text, wordlist))

        # Passing an argument list (no shell) keeps paths containing spaces
        # or shell metacharacters intact; stdout goes straight to the file.
        with open(wordlist, "wb") as out:
            status = subprocess.call([program, text], stdout=out)
        if status != 0:
            sys.stderr.write("%s exited with status %d for %s\n"
                             % (program, status, text))
#----------------------------------------------------------------------
# main program:
if __name__ == "__main__":
    # Exactly two command-line arguments choose the input and output
    # directories.  With any other argument count the built-in Balzac
    # locations are used instead; no usage message is printed.
    if len(sys.argv) == 3:
        text_dir, wordlist_dir = sys.argv[1], sys.argv[2]
    else:
        text_dir = "c:\\Balzac\\balzacbooksnofront\\"
        wordlist_dir = "c:\\Balzac\\balzacbooksnofront\\wordlists\\"
    wordcounts = make_wordlists(text_dir, wordlist_dir)
# Text file Source (historic): geocities.com/soho/square/3472
# geocities.com/soho/squaregeocities.com/soho
# (to report bad content: archivehelp @ gmail)
# |
# |
# |
# |
# |