/******************************************************************************
* Parser to convert roman transliterations into Unicode Bengali
* Copyright (C) 2003 Roshan Kamath
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
******************************************************************************/
/******************************************************************************
* BEGIN DEFINITIONS
******************************************************************************/
/*
** Local string constants written to the output by the scanner.
**
** VIRAM is the Bengali virAm sign. ZWNJ and ZWJ are the zero-width
** non-joiner and joiner; they are spelled as explicit UTF-8 byte escapes
** because the characters themselves are invisible and are easily lost when
** the file is edited or copied, which would silently turn them into empty
** strings.
*/
#define VIRAM "্"
#define ZWNJ "\xE2\x80\x8C" /* U+200C Zero Width Non Joiner */
#define ZWJ "\xE2\x80\x8D" /* U+200D Zero Width Joiner */
/*
** Prototype Declarations
**
** Helpers that take no arguments are declared with an explicit (void) so
** that the compiler checks every call against a real prototype; an empty
** parameter list "()" would leave the arguments unchecked.
*/
int isVowel(int);
int isConsonant(int);
void addHamza(void);
void addViram(void);
void reduceVowel(void);
void encode(char*);
void convh2s(void);
void adjustDanda(void);
void correctAnusvar(void);
/*
** Flag that says whether input is currently being transliterated from the
** roman scheme to UCS. It is set to 1 and cleared to 0 by a pair of rules in
** the rules section (the start and end transliteration tags). While it is 0,
** encode() and most rule actions echo the scanned text unchanged.
*/
int transliterate = 0;
/*
** Flag that says whether a ZWNJ is written after a virAm. It is toggled
** every time a '\' is scanned, so text enclosed between two '\' characters
** does not get the ZWNJ. Note that the default value is 1!
*/
int useNonJoiner = 1;
/*
** Flag that suppresses the implicit virAms. It is set to 1 and cleared to 0
** by a pair of rules in the rules section (the start and end tags of a
** no-virAm zone). While it is set, addViram() returns without doing anything.
*/
int noViram = 0;
/******************************************************************************
* END DEFINITIONS
******************************************************************************/
%%
/******************************************************************************
* BEGIN RULES
******************************************************************************/
/*
** Define token to Skip trailing white spaces. The action is empty, so blanks
** and tabs that run up to the end of a line are discarded.
*/
[ \t]+$
/*
** Replace white spaces by a single space. Add a Hamza (') if a Vowel
** follows the white spaces
*/
[ \t\r]+ printf(" "); addHamza();
/*
** Detect when to start and stop transliteration. Add a Hamza (')
** if a Vowel follows.
** NOTE(review): the quoted patterns of the next two rules are empty here;
** presumably they held the opening and closing tag text - confirm and
** restore from the original file.
*/
"" transliterate = 1; addHamza(); /* Set to TRUE */
"" transliterate = 0; /* Set to FALSE */
/*
** Detect when to start and stop adding the implicit virAms.
** NOTE(review): the quoted patterns of the next two rules are empty here as
** well; presumably they held the tag text - confirm.
*/
"" noViram = 1; /* Set to TRUE */
"" noViram = 0; /* Set to FALSE */
/*
** ECHO all HTML tags. Add a Hamza (') if a Vowel follows the tags
*/
\<[^\>]*\> ECHO; addHamza();
/*
** Detect when to use the ZWNJ. Each '\' flips useNonJoiner and is itself
** not written to the output.
*/
\\ useNonJoiner = !useNonJoiner;
/*
** Cover all punctuation marks. Update this list regularly :)
** Each one is echoed unchanged and a Hamza (') is added if a Vowel follows.
*/
[-~!@#$%&*\(\)_+=\{\}\[\];\"\<,?/] ECHO; addHamza();
/*
** Special Cases because UCS doesn't have fonts for these or these are
** special symbols that I use in context. Both rules echo the character
** unchanged when not transliterating.
*/
` {
if (!transliterate)
ECHO;
else {
/* Use .a for the Glottal Stop. addHamza() runs first so that its
lookahead sees the character following the backtick; the ".a" is then
pushed back in reverse order so the scanner reads '.' before 'a'. */
addHamza(); unput('a'); unput('.');
}
}
w {
/* Can't use v.d for w because that is same as r! So w is pushed back as a
plain v and handled by the v rule. */
if (transliterate) {
unput('v');
} else
ECHO;
}
/*
** Begin UCS Encodings here 0x0980 to 0x09FF. Note that the encode function
** uses the transliterate flag to determine whether to encode or not. This
** is also true for the other helpers like addViram(), addHamza() etc.
**
** In case of a consonant, we add a virAm (.h) in case it is not followed by
** a vowel. This can be inhibited with the noViram flag (see addViram()).
**
** For a vowel, we convert any following .h to .s This is only to maintain
** backward compatibility with my personal Jtrans convention. [This has
** however been temporarily disabled: convh2s() currently just drops the .h]
**
** Note: Since UCS does not have the short versions of the aE and aO vowels
** we represent the short versions by using the _ accent! To maintain
** consistency, even the short version of E and O are rendered similarly.
**
** Finally, we eliminate the 'a' vowel directly since the default glyph
** already had the vowel implicitly present. Of course, this is not done
** blindly :)
*/
\.n encode("ঁ");
/* 2434 exists, but I like to change the .N to .N^k or .N^j */
\.N correctAnusvar();
/* 2435 is the ':' But I shall use the standard ASCII version */
/* 2436 is a Hole */
/* Independent vowels: these patterns start with the Hamza (') */
'a encode("অ"); convh2s();
'A encode("আ"); convh2s();
'i encode("ই");
'I encode("ঈ"); convh2s();
'u encode("উ");
'U encode("ঊ"); convh2s();
'\.r encode("ঋ");
'\.l encode("ঌ");
/* 2445 is not used */
/* 2446 is not used */
'e { /* Use 'E for 'e: push back 'E in reverse order and rescan */
if (transliterate) {
unput('E'); unput('\'');
} else
ECHO;
}
'E encode("এ"); convh2s();
/* Use 'aE for 'ae */
'ae {
if (transliterate) {
unput('E'); unput('a'); unput('\'');
} else
ECHO;
}
'aE encode("ঐ"); convh2s();
/* 2449 is not used */
/* 2450 is not used */
'o { /* Use 'O for 'o: push back 'O in reverse order and rescan */
if (transliterate) {
unput('O'); unput('\'');
} else
ECHO;
}
'O encode("ও"); convh2s();
/* Use 'aO for 'ao */
'ao {
if (transliterate) {
unput('O'); unput('a'); unput ('\'');
} else
ECHO;
}
'aO encode("ঔ"); convh2s();
k encode("ক"); addViram();
kH encode("খ"); addViram();
g encode("গ"); addViram();
gH encode("ঘ"); addViram();
\.N^k encode("ঙ"); addViram();
Ch encode("চ"); addViram();
ChH encode("ছ"); addViram();
j encode("জ"); addViram();
jH encode("ঝ"); addViram();
\.N^j encode("ঞ"); addViram();
T encode("ট"); addViram();
TH encode("ঠ"); addViram();
D encode("ড"); addViram();
DH encode("ঢ"); addViram();
N encode("ণ"); addViram();
t encode("ত"); addViram();
tH encode("থ"); addViram();
d encode("দ"); addViram();
dH encode("ধ"); addViram();
n encode("ন"); addViram();
/* 2473 is not used */
p encode("প"); addViram();
pH encode("ফ"); addViram();
b encode("ব"); addViram();
bH encode("ভ"); addViram();
m encode("ম"); addViram();
y encode("য"); addViram();
r encode("র"); addViram();
/* 2481 is not used */
l encode("ল"); addViram();
/* L is represented by l.d - my convention. The action writes the same two
strings that the l and .d rules write. */
L {
if (!transliterate)
ECHO;
else {
encode("ল"); encode ("়"); addViram();
}
}
/* 2483 is not used */
/* 2484 is not used */
/* 2485 is not used */
/* 2356 is not used. NOTE(review): 2356 lies outside the 0x0980 to 0x09FF
range covered by this table; presumably this line was carried over from
another script's table - confirm. */
/* v cannot be represented as b.d because that is same as r! */
v encode("ব"); addViram(); /* Same as b */
Sh encode("শ"); addViram();
Xh encode("ষ"); addViram();
s encode("স"); addViram();
h encode("হ"); addViram();
/* 2490 is a Hole */
/* 2491 is a Hole */
\.d encode("়"); addViram(); /* This is usually NEVER part of external
input */
\.a {
    /* The standard bengali fonts don't support the avagraha (ঽ) yet, so
       the avagraha is just skipped while transliterating. Outside a
       transliteration zone the ".a" is ordinary text and must not be lost:
       it is echoed unchanged, like every other rule does in that case. */
    if (!transliterate)
        ECHO;
}
a {
if (!transliterate) {
ECHO;
} else {
/* Consume any 'a' symbols judiciously since they are irrelevant
in UCS. adjustDanda() echoes the 'a' only when a vowel or a ".d"
follows it. */
adjustDanda();
convh2s(); /* In case there is a .h following */
}
}
/* Dependent vowel signs: the same letters as the independent vowels, but
without the leading Hamza (') */
A encode("া"); convh2s();
i encode("ি");
I encode("ী"); convh2s();
u encode("ু");
U encode("ূ"); convh2s();
\.r encode("ৃ");
\.R encode("ৄ"); convh2s();
/* 2501 is not used */
/* 2502 is not used */
e { /* Use E for e: push back E and rescan */
if (transliterate) {
unput('E');
} else
ECHO;
}
E encode("ে"); convh2s();
/* Use aE for ae */
ae {
if (transliterate) {
unput('E'); unput('a');
} else
ECHO;
}
aE encode("ৈ"); convh2s();
/* 2505 is not used */
/* 2506 is not used */
o { /* Use O for o: push back O and rescan */
if (transliterate) {
unput('O');
} else
ECHO;
}
O encode("ো"); convh2s();
/* Use aO for ao */
ao {
if (transliterate) {
unput('O'); unput('a');
} else
ECHO;
}
aO encode("ৌ"); convh2s();
/* An explicit .h writes the virAm directly, without any lookahead */
\.h {
if (transliterate) {
printf("%s",VIRAM);
/* Now add the NonJoiner only if it is not inhibited */
if (useNonJoiner) printf("%s",ZWNJ);
} else
ECHO;
}
\.s {
    /* While we could borrow the Devnagari codepoint for .s it is useless
       since none of the Bengali consonants work with it, so .s produces no
       output while transliterating. Outside a transliteration zone the
       ".s" is ordinary text and is echoed unchanged. */
    if (!transliterate)
        ECHO;
}
/* 2510 is a Hole */
/* 2511 is a Hole */
/* 2512 is a Hole */
/* 2513 is a Hole */
/* 2514 is a Hole */
/* 2515 is a Hole */
/* 2516 is a Hole */
/* 2517 is a Hole */
/* 2518 is a Hole */
/* 2519 is not used */
/* The rules q, Kh, Gh, z and f below each write a base consonant followed by
the same sign that the .d rule writes, then call addViram(). When not
transliterating they echo the scanned text unchanged. */
/* q is same as k.d */
q {
if (!transliterate)
ECHO;
else {
encode("ক"); encode("়"); addViram();
}
}
/* Kh is same as kH.d */
Kh {
if (!transliterate)
ECHO;
else {
encode("খ"); encode("়"); addViram();
}
}
/* Gh is same as gH.d */
Gh {
if (!transliterate)
ECHO;
else {
encode("ঘ"); encode("়"); addViram();
}
}
/* z is same as j.d */
z {
if (!transliterate)
ECHO;
else {
encode("জ"); encode("়"); addViram();
}
}
R encode("ড়"); addViram();
RH encode("ঢ়"); addViram();
/* 2526 is a Hole */
/* 2527 is not used (at least in urdU) */
/* f is pH.d */
f {
if (!transliterate)
ECHO;
else {
encode("ফ"); encode("়"); addViram();
}
}
'\.R encode("ৠ"); convh2s();
'\.L encode("ৡ"); convh2s();
\.l encode("ৢ");
\.L encode("ৣ"); convh2s();
\| encode("।"); /* Borrowed from Devnagari */
\|\| encode("॥"); /* Borrowed from Devnagari */
/* 2532 is a Hole */
/* 2533 is a Hole */
/* Digits */
0 encode("০");
1 encode("১");
2 encode("২");
3 encode("৩");
4 encode("৪");
5 encode("৫");
6 encode("৬");
7 encode("৭");
8 encode("৮");
9 encode("৯");
/* A '.' that did not start any of the longer patterns above */
\. encode("॰"); /* Borrowed from Devnagari */
/******************************************************************************
* END RULES
******************************************************************************/
%%
/******************************************************************************
* BEGIN USER SUBROUTINES
******************************************************************************/
/*
** End-of-input hook for the scanner. It always returns 1, the conventional
** lex signal that there is no further input to continue with.
*/
int yywrap(void)
{
    const int no_more_input = 1;

    return no_more_input;
}
/*
** This function checks if the given character is possibly the beginning of a
** dependent vowel.
**
** c - a character the caller has already read with input().
**
** Returns 1 for a, A, i, I, u, U, e, E, o, O, and for '.' when the character
** after it is one of r, R, l, L. Returns 0 for everything else.
**
** For '.', one more character is read with input() and pushed straight back
** with unput(). The caller must therefore unput(c) only AFTER this call, so
** that the lookahead here sees the character that follows c.
*/
int isVowel(int c) {
    int next;

    switch (c) {
    case 'a':
    case 'A':
    case 'i':
    case 'I':
    case 'u':
    case 'U':
    case 'e':
    case 'E':
    case 'o':
    case 'O':
        return 1;
    case '.':
        /* We need further look ahead */
        next = input();
        /* End of input is reported as 0 by POSIX lex and as a negative
           value by some flex versions. Neither is a character, so it must
           not be pushed back. */
        if (next <= 0)
            return 0;
        unput(next); /* Undo the lookahead */
        return next == 'r' || next == 'R' || next == 'l' || next == 'L';
    default:
        return 0;
    }
}
/*
** This function checks if the given character is the beginning of a
** consonant.
**
** c - a character the caller has already read with input().
**
** Returns 1 for the characters listed in the switch below, and for '.' when
** the character after it is N or a. Returns 0 for everything else.
**
** For '.', one more character is read with input() and pushed straight back
** with unput(). The caller must therefore unput(c) only AFTER this call.
*/
int isConsonant(int c) {
    int next;

    switch (c) {
    case '`':
    case 'w':
    case '\'':
    case 'k':
    case 'g':
    case 'C':
    case 'j':
    case 'T':
    case 'D':
    case 'N':
    case 't':
    case 'd':
    case 'n':
    case 'p':
    case 'b':
    case 'm':
    case 'y':
    case 'r':
    case 'l':
    case 'L':
    case 'v':
    case 'S':
    case 'X':
    case 's':
    case 'h':
    case 'H': /* As 'H' is an implicit consonant */
    case 'q':
    case 'K':
    case 'G':
    case 'z':
    case 'Z':
    case 'R':
    case 'f':
        return 1;
    case '.':
        /* We need further look ahead */
        next = input();
        /* End of input is reported as 0 by POSIX lex and as a negative
           value by some flex versions. Neither is a character, so it must
           not be pushed back. */
        if (next <= 0)
            return 0;
        unput(next); /* Undo the lookahead */
        return next == 'N' || next == 'a';
    default:
        return 0;
    }
}
/*
** This function adds a Hamza (') in front of the upcoming input if the
** following character(s) begin a vowel, so that the vowel is then scanned by
** one of the patterns that start with the Hamza. It does nothing unless the
** transliterate flag is set, and nothing at end of input.
*/
void addHamza(void) {
    int c;
    int vowel_follows;

    if (!transliterate)
        return;

    c = input(); /* Attempt Lookahead */
    /* End of input is reported as 0 by POSIX lex and as a negative value by
       some flex versions. There is nothing to inspect or push back then. */
    if (c <= 0)
        return;

    /* isVowel() may look one character further ahead, so it has to run
       before c itself is pushed back. */
    vowel_follows = isVowel(c);
    unput(c); /* Undo the lookahead */
    if (vowel_follows)
        unput('\'');
}
/*
** This function writes a virAm after the consonant that was just encoded,
** unless the following character(s) begin a vowel. It does nothing when the
** transliterate flag is clear or the noViram flag is set.
**
** The next character is read and then handled as follows:
**   - end of input: the virAm is written on its own;
**   - start of a vowel (see isVowel()): nothing is written;
**   - ".d": nothing is written, the decision is left to the .d rule, whose
**     action calls addViram() again;
**   - 'H': the virAm is written and the 'H' is replaced by 'h' in the input;
**   - anything else: the virAm is written, followed by the ZWNJ when
**     useNonJoiner is set.
** Apart from the 'H' replacement the lookahead is always pushed back.
*/
void addViram(void) {
    int c;

    if (!transliterate)
        return;
    /* If we are in a noViram zone, just return */
    if (noViram)
        return;

    c = input(); /* Attempt Lookahead */
    /* End of input is reported as 0 by POSIX lex and as a negative value by
       some flex versions. The consonant is final: close it with a virAm and
       push nothing back. */
    if (c <= 0) {
        printf("%s", VIRAM);
        return;
    }

    /* isVowel() may look one character further ahead, so c is pushed back
       only after it has run. */
    if (isVowel(c)) {
        unput(c); /* Undo the lookahead */
        return;
    }

    /* It could be a ".d" in which case the VIRAM decision is delayed to
       beyond the ".d" */
    if (c == '.') {
        int d = input();
        if (d > 0) {
            unput(d);
            if (d == 'd') {
                unput(c);
                return;
            }
        }
    }

    printf("%s", VIRAM);
    if (c == 'H') {
        /* Replace this now solitary 'H' by 'h' */
        unput('h');
    } else {
        /* Put the ZWNJ only if the flag is set */
        if (useNonJoiner)
            printf("%s", ZWNJ);
        unput(c); /* Undo the lookahead */
    }
}
/*
** This function inserts a ".s" into the upcoming input (presumably after a
** long vowel) to indicate a vowel grade reduction. If a ".n" follows, the
** ".s" is inserted after the ".n"; otherwise it is inserted immediately.
**
** Unlike the other helpers it does not test the transliterate flag. No
** caller is visible in this file.
*/
void reduceVowel(void) {
    int first = input(); /* Look ahead for a .n */

    if (first == '.') {
        int second = input();
        if (second == 'n') {
            /* unput() is last-in first-out: this leaves ".n.s" to be read */
            unput('s');
            unput('.');
            unput(second);
            unput(first);
            return;
        }
        /* End of input (0 in POSIX lex, negative in some flex versions) is
           not a character and must not be pushed back. */
        if (second > 0)
            unput(second);
    }
    if (first > 0)
        unput(first);
    unput('s');
    unput('.');
}
/*
** This function corrects the .N to one of .N^k, .N^j, N, n, and m depending
** on the following consonant
*/
void correctAnusvar() {
if (!transliterate) return;
int c = input(); // Look ahead for the following consonant;
unput(c); // Restore lookahead
switch (c) {
case 'k':
case 'g':
/* Replace by .N^k */
unput('k');
unput('^');
unput('N');
unput('.');
break;
case 'C':
case 'j':
/* Replace by .N^j */
unput('j');
unput('^');
unput('N');
unput('.');
break;
case 't':
case 'd':
case 'n':
/* Replace by n */
unput('n');
break;
case 'T':
case 'D':
case 'N':
/* Replace by N */
unput('N');
break;
case 'p':
case 'b':
case 'm':
/* Replace by m */
unput('m');
break;
}
}
/*
** This function decides if we should retain the 'a' that was scanned already
** or not. The 'a' is echoed to the output when the next character begins a
** vowel (see isVowel()) or when a ".d" follows; in every other case,
** including end of input, the 'a' is dropped. Retaining the 'a' is only a
** way of flagging potential bugs in the input text. Note, this may need to
** be modified based upon future additions since this behavior is quite
** arbitrary.
**
** The lookahead is always pushed back. The caller (the rule for 'a') only
** calls this while transliterating.
*/
void adjustDanda(void) {
    int c = input();

    /* End of input is reported as 0 by POSIX lex and as a negative value by
       some flex versions. There is nothing to inspect or push back then. */
    if (c <= 0)
        return;

    /* isVowel() may look one character further ahead, so c is pushed back
       only at the very end. */
    if (isVowel(c)) {
        ECHO; /* Retain the 'a' in the output */
    } else if (c == '.') {
        int d = input(); /* Lookahead for a d */
        if (d > 0) {
            unput(d);
            if (d == 'd')
                ECHO; /* Retain the 'a' in the output */
        }
    }
    unput(c);
}
/*
** This function prints the incoming string to stdout if the transliterate
** flag is set. Else it just sends the scanned token as is to the output.
**
** utfValue - the string to write in place of the scanned token.
**
** ECHO is a macro that may expand to a statement rather than an expression,
** so it is used in an if/else and not as an operand of ?:.
*/
void encode(char* utfValue) {
    if (transliterate) {
        printf("%s", utfValue);
    } else {
        ECHO;
    }
}
/*
** The convh2s is a function deliberately PUT here for backward compatibility
** purposes (so that my UCS scheme is compatible with my Jtrans scheme).
** The ORIGINAL intention was to replace any .h following a vowel by a .s
**
** Currently it just removes a ".h" that directly follows a vowel: if the next
** two characters are ".h" they are consumed, otherwise the lookahead is
** pushed back untouched. It does nothing unless the transliterate flag is
** set.
*/
void convh2s(void) {
    int dot;

    if (!transliterate)
        return;

    dot = input();
    /* End of input is reported as 0 by POSIX lex and as a negative value by
       some flex versions. Neither is a character to push back. */
    if (dot <= 0)
        return;

    if (dot == '.') {
        /* Look for a further 'h' */
        int h = input();
        if (h == 'h')
            return; /* Swallow the ".h" */
        if (h > 0)
            unput(h);
    }
    unput(dot);
}
/******************************************************************************
* END USER SUBROUTINES
******************************************************************************/
/* Source: geocities.com/roshbaby/UCS (geocities.com/roshbaby) */