#!/bin/bash
# ###############################################################################
#
#
# ocube
#
#
# *** tesseract made simple ***
#
# ###############################################################################
#
# ocube is a tesseract wrapper to preprocess pictures for OCR
#
# defaults
# all variables are overwritten with the respective option!
#
# NOTE: the two delete flags are cross-wired with their names. The processing
# loop checks TESS before removing the converted tif image and TIFF before
# removing the tesseract output files; -t clears TESS and -o clears TIFF.
TIFF=TRUE        # TRUE = delete tesseract output files after use
TESS=TRUE        # TRUE = delete the converted tif image after use
TIFFDIR=/tmp/    # folder for tiff images
TESSDIR=/tmp/    # folder for tesseract data
SILENT=          # if TRUE no output on STDOUT or STDERR
CONV=            # TRUE = convert extensively
ERRORLOG=""      # errorlog; if not empty, error messages go there, otherwise STDERR
EX_FILE=         # exclude files found in EX_FILE
FILE=""          # if FILE is *not* empty, scanned text (i.e. result) goes to FILE

# execute .conf file, if file exists; it may override any default above
if [[ -f /etc/ocube.conf ]]; then
  . /etc/ocube.conf
fi
# Called without a first argument: print the usage text on STDOUT and stop.
# The here-document is unquoted on purpose so that $0 expands to the name the
# script was started with.
if [[ -z "$1" ]]; then
  cat <<EOF
Usage: $0 [OPTIONS] file1 file2 ...
converts files to tif, scans them with tesseract and outputs the text on STDOUT
OPTIONS:
  -t TIF-DIR   saves converted TIF images in target-directory TIF-DIR
  -o TESS-DIR  saves files created by tesseract in TESS-DIR
  -i <FILE>    define input file (otherwise the files named on the command line)
  -f FILE      saves all text output in file FILE (messages on STDOUT)
  -s           silent; no output on STDOUT
  -c           convert with fill white, resize, sigmoidal-contrast, etc..
  -l           save error messages in ocube.error.log instead of showing them on STDERR
  -L <file>    save error messages in <file>
  -e           exclude files found in ocube.error.log. Avoids rescaning of files
               that were processed already.
  -E <file>    same as -e but with file <file>
By default, everything (progress-, error-messages and output) will be shown on STDOUT.
Read ocube.info for more information
Scanning takes time... be patient :)

EOF
  exit
fi
# get options...
#######################################
# Parse the command-line options into the global settings.
# Globals:   writes TIFFDIR, TESSDIR, TIFF, TESS, SCANS, FILE, SILENT, CONV,
#            ERRORLOG and EX_FILE; advances OPTIND past the parsed options
# Arguments: the script's command-line arguments
# Outputs:   a message on STDERR for an unknown option or a missing argument
# Returns:   0; exits the script with status 2 on a malformed command line
#######################################
parse_options() {
  local opt

  # The leading ':' selects getopts' silent mode, so the ':' and '?' arms
  # below have to report problems themselves.
  while getopts ":t:o:i:f:sclL:eE:" opt; do
    case "$opt" in
      t) # save converted tif files in directory
        TIFFDIR="$OPTARG"/
        # TESS is the flag that is checked before the tif image is removed,
        # so clearing it keeps the tif files in TIFFDIR.
        TESS=FALSE
        ;;
      o) # save tesseract files in directory
        TESSDIR="$OPTARG"/
        # TIFF is the flag that is checked before the tesseract output is
        # removed, so clearing it keeps those files in TESSDIR.
        TIFF=FALSE
        ;;
      i) # input file
        SCANS="$OPTARG"
        ;;
      f) # save scanned text in file instead of STDOUT
        FILE="$OPTARG"
        ;;
      s) # silent
        SILENT=TRUE
        ;;
      c) # convert picture extensively
        CONV=TRUE
        ;;
      l) # save error messages in ocube.error.log
        ERRORLOG="ocube.error.log"
        ;;
      L) # save error messages in <file>
        ERRORLOG="$OPTARG"
        ;;
      e) # exclude files listed in ocube.error.log
        EX_FILE="ocube.error.log"
        ;;
      E) # exclude files listed in <file>
        EX_FILE="$OPTARG"
        ;;
      :) # option given without its required argument
        echo "$0: option -$OPTARG requires an argument" >&2
        exit 2
        ;;
      \?) # option that is not in the list above
        echo "$0: unknown option -$OPTARG" >&2
        exit 2
        ;;
    esac
  done
}

parse_options "$@"
# drop the parsed options; what remains are the input files
shift $((OPTIND - 1))
# redirections ...
#######################################
# Point STDOUT/STDERR of the running shell at their final destinations.
# Globals:   SILENT, FILE, ERRORLOG (all read)
# Arguments: none
# Outputs:   nothing itself; changes where later output goes
# Returns:   0; exits the script with status 1 if FILE or ERRORLOG cannot be
#            opened for appending
#######################################
setup_redirections() {
  # if SILENT is set, redirect everything to /dev/null.
  # Order matters: STDOUT has to be moved first, otherwise "2>&1" would
  # attach STDERR to the old STDOUT and the messages would still be shown.
  if [[ $SILENT == TRUE ]]; then
    exec 1>/dev/null 2>&1
  fi

  # if FILE (-f) is set, append the scanned text to FILE (overrides SILENT
  # for STDOUT). The target is quoted so names with blanks work.
  if [[ -n $FILE ]]; then
    exec 1>>"$FILE" || exit 1
  fi

  # if ERRORLOG (-L|-l) is set, append STDERR to ERRORLOG (overrides SILENT
  # for STDERR)
  if [[ -n $ERRORLOG ]]; then
    exec 2>>"$ERRORLOG" || exit 1
  fi
}

setup_redirections
# check for files' existence
# Every failed check reports on STDERR and stops with status 1 so that a
# caller can tell the run did not happen.

# check, if exclusion-file exists but only when activated, i.e. not empty
if [[ -n "$EX_FILE" && ! -f "$EX_FILE" ]]; then
  echo "-E/-e: $EX_FILE: no such file" >&2
  exit 1
fi

# check, if tesseract-folder exists
if [[ ! -d "$TESSDIR" ]]; then
  echo "-o: $TESSDIR: no such folder" >&2
  exit 1
fi

# check, if tif-folder exists
if [[ ! -d "$TIFFDIR" ]]; then
  echo "-t: $TIFFDIR: no such folder" >&2
  exit 1
fi

echo "TIF-output directory=$TIFFDIR" >&2
echo "tesseract output directory=$TESSDIR" >&2

# if there is no specifically mentioned input file (-i), use the remaining
# command-line arguments, joined into one blank-separated string
if [[ -z "$SCANS" ]]; then
  SCANS="$*"
fi

# exit with error message if no input files are present
if [[ -z "$SCANS" ]]; then
  echo "No input file, aborting!" >&2
  exit 1
fi
#######################################
# Convert one picture to tif, run tesseract on it and print the scanned text.
# Globals:   EX_FILE, TIFFDIR, TESSDIR, CONV, TESS, TIFF (all read)
# Arguments: $1 - path of the picture to scan
# Outputs:   scanned text on STDOUT; progress and error messages on STDERR
# Returns:   0 if the file was scanned or skipped through the exclusion
#            file, 1 if it could not be processed
#######################################
process_scan() {
  local i=$1
  local I_FILE NEWTIF T_FILE
  local status=0
  local -a convert_opts

  # -e/E: check exclusion file list.
  # -F and -- make grep take the name as literal text, not as a regular
  # expression or as an option.
  if [[ -n "$EX_FILE" ]] && grep -qF -- "$i" "$EX_FILE"; then
    return 0
  fi

  # check if file exists
  if [[ ! -f "$i" ]]; then
    echo "$i: file not found" >&2
    return 1
  fi

  # check if graphics file can be converted
  if ! identify "$i" 1>&2; then
    echo "$i: not convertable" >&2
    return 1
  fi

  # file ok, let's start
  echo "processeing: $i ..." >&2
  I_FILE=${i##*/}                        # just the filename w/o directory
  NEWTIF="$TIFFDIR${I_FILE%.*}.tif"      # tif image handed to tesseract
  T_FILE="$TESSDIR${I_FILE%.*}"          # tesseract output base name

  # converting the graphic file; the options are collected in an array so
  # that the whole command line reaches convert in a single call
  convert_opts=(-density 150x150)
  if [[ $CONV == "TRUE" ]]; then
    # convert with additional processing
    convert_opts+=(
      -resize 200% -fill white -tint 50 -level 20%,80%,1.0
      -sigmoidal-contrast 30,50% -sharpen 0x2 -compress none -monochrome
    )
  else
    # convert simply (better for line art)
    convert_opts+=(-compress none)
  fi
  convert "$i" "${convert_opts[@]}" "$NEWTIF" 1>&2

  if [[ ! -f "$NEWTIF" ]]; then
    echo "$i: conversion failed" >&2
    status=1
  else
    # scanning the newly created tif
    tesseract "$NEWTIF" "$T_FILE" 1>&2
    if [[ -f "$T_FILE.txt" ]]; then
      # output scanned text
      cat "$T_FILE".txt
    else
      echo "$i: tesseract produced no text file" >&2
      status=1
    fi
  fi

  # delete graphic file after use, if not unset in option (-t clears TESS)
  if [[ $TESS == "TRUE" ]]; then
    rm -f -- "$NEWTIF"
  fi

  # delete tesseract output, if not unset in option (-o clears TIFF);
  # -f because not every one of these files is necessarily there
  if [[ $TIFF == "TRUE" ]]; then
    rm -f -- "$T_FILE".map "$T_FILE".raw "$T_FILE".txt
  fi

  return "$status"
}

# SCANS is a blank-separated list of file names, so it is expanded unquoted
# on purpose to split it into the individual files.
for i in $SCANS
do
  # a failure is already reported on STDERR; carry on with the next file
  process_scan "$i" || true
done