/******************************************************************************
 * Parser to convert roman transliterations into Unicode Bengali
 * Copyright (C) 2003 Roshan Kamath
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ******************************************************************************/

 /******************************************************************************
 * BEGIN DEFINITIONS
 ******************************************************************************/

 /*
 ** First the Local Defines that will be used in the program
 */
 /* The virAm sign printed by addViram() and by the .h rule */
 #define VIRAM	"্"
 /*
 ** The two joiner controls are invisible characters, so they are spelled out
 ** as UTF-8 byte escapes: a literal copy is easily lost or altered by an
 ** editor or by copy/paste without anyone noticing.
 */
 #define ZWNJ	"\xE2\x80\x8C"	/* U+200C Zero Width Non Joiner */
 #define ZWJ	"\xE2\x80\x8D"	/* U+200D Zero Width Joiner */

 /*
 ** Prototype Declarations
 */
 int isVowel(int c);		/* 1 if c can start a vowel (may look ahead) */
 int isConsonant(int c);	/* 1 if c can start a consonant (may look ahead) */
 void addHamza(void);		/* push back a ' when a vowel follows */
 void addViram(void);		/* print the virAm unless a vowel follows */
 void reduceVowel(void);	/* push back a .s, after a following .n if any */
 void encode(char* utfValue);	/* print utfValue, or echo the token as is */
 void convh2s(void);		/* swallow a .h that follows a vowel */
 void adjustDanda(void);	/* decide whether a scanned 'a' is retained */
 void correctAnusvar(void);	/* rewrite .N according to the next consonant */

 /*
 ** Global flag that switches Transliteration from English to UCS on and off.
 ** It is set to 1 by the rule for the opening transliteration tag and reset
 ** to 0 by the rule for the matching closing tag. The helpers (encode(),
 ** addHamza(), addViram(), ...) consult it before converting anything.
 ** NOTE(review): the tag names are missing from this copy of the file; they
 ** look like markup that was stripped -- confirm against the original.
 */
 int transliterate = 0;

 /*
 ** Use a "flag" to detect when to use the ZWNJ. This global flag is toggled
 ** every time we encounter the '\'. Effectively, text enclosed within a pair
 ** of '\' will not use the ZWNJ. Note that the default value is 1!
 */
 int useNonJoiner = 1;

 /*
 ** Use a flag to detect if we should add the implicit virAms. While this flag
 ** is 1, addViram() returns without printing anything. It is set to 1 by the
 ** rule for the opening "no virAm" tag and reset to 0 by the rule for the
 ** closing one.
 ** NOTE(review): the tag names are missing from this copy of the file.
 */
 int noViram = 0;


 /******************************************************************************
 * END DEFINITIONS
 ******************************************************************************/


%%
 /******************************************************************************
 * BEGIN RULES
 ******************************************************************************/

	/*
	** Skip trailing white spaces: the rule has no action, so blanks and tabs
	** at the end of a line are matched and dropped
	*/
[ \t]+$

	/*
	** Replace white spaces by a single space. Add a Hamza (') if a Vowel
	** follows the white spaces
	*/
[ \t\r]+	printf(" "); addHamza();


	/*
	** Detect when to start and stop transliteration. Add a Hamza (')
	** if a Vowel follows the start tag.
	** NOTE(review): both patterns below are empty strings in this copy of
	** the file. The tag names they matched appear to have been stripped and
	** have to be restored before the scanner can be generated.
	*/
""		transliterate = 1; addHamza(); /* Set to TRUE */
""		transliterate = 0; /* Set to FALSE */

	/*
	** Detect when to start and stop adding the implicit virAms.
	** NOTE(review): same problem as above, the two tag patterns are empty.
	*/
""		noViram = 1; /* Set to TRUE */
""	noViram = 0; /* Set to FALSE */

	/* 
	** ECHO all HTML tags (anything from a '<' up to the next '>') unchanged.
	** Add a Hamza (') if a Vowel follows the tags
	*/
\<[^\>]*\>	ECHO; addHamza();

	/*
	** Detect when to use the ZWNJ: every '\' toggles the useNonJoiner flag
	** and is itself removed from the output
	*/
\\		useNonJoiner = !useNonJoiner;

	/*
	** Cover all punctuation marks. Update this list regularly :)
	** Each mark is echoed as is, and a Hamza (') is added if a Vowel follows.
	*/
[-~!@#$%&*\(\)_+=\{\}\[\];\"\<,?/]		ECHO; addHamza();

	/*
	** Special Cases because UCS doesn't have fonts for these or these are
	** special symbols that I use in context. Both rules work by pushing
	** replacement text back onto the input so that it is scanned again by
	** the ordinary rules.
	*/
`	{
		if (!transliterate)
			ECHO;
		else {
			/* Use .a for the Glottal Stop. addHamza() runs first so that
			   its lookahead sees the text following the backquote; the
			   ".a" is then pushed back in front of it */
			addHamza(); unput('a'); unput('.'); 
		}
	}
w	{
		/* Can't use v.d for w because that is same as r! */
		/* So w is simply rescanned as v */
		if (transliterate) {
			unput('v'); 
		} else
			ECHO;
	}

	/*
	** Begin UCS Encodings here 0x0980 to 0x09FF. Note that the encode function
	** uses the transliterate flag to determine whether to encode or not. This
	** is also true for the other helpers like addViram(), addHamza() etc.
	**
	** In case of a consonant, we add a virAm (.h) in case it is not followed by
	** a vowel. This can be inhibited with the tags that set the noViram flag.
	**
	** For a vowel, we convert any following .h to .s This is only to maintain
	** backward compatibility with my personal Jtrans convention. [This has
	** however been temporarily disabled: convh2s() currently just drops the
	** .h.]
	**
	** Note: Since UCS does not have the short versions of the aE and aO vowels
	** we represent the short versions by using the _ accent! To maintain
	** consistency, even the short version of E and O are rendered similarly.
	**
	** Finally, we eliminate the 'a' vowel directly since the default glyph
	** already had the vowel implicitly present. Of course, this is not done
	** blindly :)
	**
	** The rules that start with a ' are the independent vowels. The ' is
	** normally not typed by the user; addHamza() pushes it back in front of
	** a vowel that follows white space, a tag or a punctuation mark.
	*/
\.n		encode("ঁ");
		/* 2434 exists, but I like to change the .N to .N^k or .N^j */
\.N		correctAnusvar();
		/* 2435 is the ':' But I shall use the standard ASCII version */
		/* 2436 is a Hole */
'a		encode("অ"); convh2s();
'A		encode("আ"); convh2s();
'i		encode("ই");
'I		encode("ঈ"); convh2s();
'u		encode("উ");
'U		encode("ঊ"); convh2s();
'\.r	encode("ঋ");
'\.l	encode("ঌ");
		/* 2445 is not used */
		/* 2446 is not used */
'e		{	/* Use 'E for 'e: push back 'E so that the next rule handles it */
			if (transliterate) {
				unput('E'); unput('\'');
			} else
				ECHO;
		}
'E		encode("এ"); convh2s();
		/* Use 'aE for 'ae */
'ae		{
			if (transliterate) {
				unput('E'); unput('a'); unput('\'');
			} else
				ECHO;
		}
'aE		encode("ঐ"); convh2s();
		/* 2449 is not used */
		/* 2450 is not used */
'o		{	/* Use 'O for 'o: push back 'O so that the next rule handles it */
			if (transliterate) {
				unput('O'); unput('\'');
			} else
				ECHO;
		}
'O		encode("ও"); convh2s();
		/* Use 'aO for 'ao */
'ao		{
			if (transliterate) {
				unput('O'); unput('a'); unput ('\'');
			} else
				ECHO;
		}
'aO		encode("ঔ"); convh2s();

k		encode("ক"); addViram();
		/* Every consonant rule follows the same pattern: print the letter,
		   then let addViram() look ahead and print a virAm unless a vowel
		   follows. An 'H' that directly follows a shorter match is handled
		   inside addViram(), which rescans it as 'h' */
kH		encode("খ"); addViram();
g		encode("গ"); addViram();
gH		encode("ঘ"); addViram();
		/* .N^k and .N^j are what correctAnusvar() pushes back for a .N */
\.N^k	encode("ঙ"); addViram();

Ch		encode("চ"); addViram();
ChH		encode("ছ"); addViram();
j		encode("জ"); addViram();
jH		encode("ঝ"); addViram();
\.N^j	encode("ঞ"); addViram();

T		encode("ট"); addViram();
TH		encode("ঠ"); addViram();
D		encode("ড"); addViram();
DH		encode("ঢ"); addViram();
N		encode("ণ"); addViram();

t		encode("ত"); addViram();
tH		encode("থ"); addViram();
d		encode("দ"); addViram();
dH		encode("ধ"); addViram();
n		encode("ন"); addViram();
		/* 2473 is not used */

p		encode("প"); addViram();
pH		encode("ফ"); addViram();
b		encode("ব"); addViram();
bH		encode("ভ"); addViram();
m		encode("ম"); addViram();

y		encode("য"); addViram();
r		encode("র"); addViram();
		/* 2481 is not used */
l		encode("ল"); addViram();
		 /* L is represented by l.d - my convention */
		 /* i.e. the letter for l followed by the sign the .d rule prints */
L		{
			if (!transliterate)
				ECHO;
			else {
				encode("ল"); encode ("়"); addViram();
			}
		}
		/* 2483 is not used */
		/* 2484 is not used */
		/* 2485 is not used */

		/* 2356 is not used */
		/* NOTE(review): 2356 lies outside the 0x0980-0x09FF range used in
		   this file; the line above looks carried over from a version for
		   another script -- confirm */
		/* v cannot be represented as b.d because that is same as r! */
v		encode("ব"); addViram(); /* Same as b */
Sh		encode("শ"); addViram();
Xh		encode("ষ"); addViram();
s		encode("স"); addViram();
h		encode("হ"); addViram();
		/* 2490 is a Hole */
		/* 2491 is a Hole */
\.d		encode("়"); addViram(); /* This is usually NEVER part of external
										  input */
		/* The standard bengali fonts don't support the avagraha (ঽ) yet
		   so just skip the avagraha for now */
		/* The rule has no action, so .a is dropped. NOTE(review): it is
		   dropped even when the transliterate flag is not set */
\.a

a		{
			/* The inherent vowel: normally nothing is printed for it */
			if (!transliterate) {
				ECHO; 
			} else {
				/* Consume any 'a' symbols judiciously since they are irrelevant
				   in UCS */
				adjustDanda();
				convh2s(); /* In case there is a .h following */
			}
		}
		/* The rules below print the dependent vowel signs. convh2s() is
		   called after some of them to swallow a directly following .h */
A		encode("া"); convh2s();
i		encode("ি"); 
I		encode("ী"); convh2s();
u		encode("ু");
U		encode("ূ"); convh2s();
\.r		encode("ৃ");
\.R		encode("ৄ"); convh2s();
		/* 2501 is not used */
		/* 2502 is not used */
e		{	/* Use E for e: push back E so that the next rule handles it */
			if (transliterate) {
				unput('E');
			} else
				ECHO;
		}
E		encode("ে"); convh2s();
		/* Use aE for ae */
ae		{
			if (transliterate) {
				unput('E'); unput('a');
			} else
				ECHO;
		}
aE		encode("ৈ"); convh2s();
		/* 2505 is not used */
		/* 2506 is not used */
o		{	/* Use O for o: push back O so that the next rule handles it */
			if (transliterate) {
				unput('O');
			} else
				ECHO;
		}
O		encode("ো"); convh2s();
		/* Use aO for ao */
ao		{
			if (transliterate) {
				unput('O'); unput('a');
			} else
				ECHO;
		}
aO		encode("ৌ"); convh2s();
		/* An explicit .h prints the virAm itself, followed by the ZWNJ
		   unless the useNonJoiner flag has been switched off */
\.h		{
			if (transliterate) {
				printf("%s",VIRAM);
				/* Now add the NonJoiner only if it is not inhibited */
				if (useNonJoiner) printf("%s",ZWNJ);
			} else
				ECHO;
		}
		/* While we could borrow the Devnagari codepoint for .s it is useless
		   since none of the Bengali consonants work with it */
		/* The rule has no action, so .s is dropped. NOTE(review): it is
		   dropped even when the transliterate flag is not set */
\.s		

		/* 2510 is a Hole */
		/* 2511 is a Hole */
		/* 2512 is a Hole */
		/* 2513 is a Hole */
		/* 2514 is a Hole */
		/* 2515 is a Hole */
		/* 2516 is a Hole */
		/* 2517 is a Hole */
		/* 2518 is a Hole */
		/* 2519 is not used */
		/* The letters q, Kh, Gh, z and f are printed as a base consonant
		   followed by the same sign that the .d rule prints. Outside
		   transliterate mode the typed text is echoed unchanged */
		/* q is same as k.d */
q		{
			if (!transliterate)
				ECHO;
			else {
				encode("ক"); encode("়"); addViram();
			}
		}
		/* Kh is same as kH.d */
Kh		{
			if (!transliterate)
				ECHO;
			else {
				encode("খ"); encode("়"); addViram();
			}
		}
		/* Gh is same as gH.d */
Gh		{
			if (!transliterate)
				ECHO;
			else {
				encode("ঘ"); encode("়"); addViram();
			}
		}
		/* z is same as j.d */
z		{
			if (!transliterate)
				ECHO;
			else {
				encode("জ"); encode("়"); addViram();
			}
		}
R		encode("ড়"); addViram();
RH		encode("ঢ়"); addViram();
		/* 2526 is a Hole */
		/* 2527 is not used (at least in urdU) */

		/* f is pH.d */
f		{
			if (!transliterate)
				ECHO;
			else {
				encode("ফ"); encode("়"); addViram();
			}
		}
		/* Remaining vocalic vowels: the forms with a leading ' are the
		   independent letters, the forms without are the dependent signs */
'\.R	encode("ৠ"); convh2s();
'\.L	encode("ৡ"); convh2s();
\.l		encode("ৢ");
\.L		encode("ৣ"); convh2s();

\|		encode("।"); /* Borrowed from Devnagari */
\|\|	encode("॥"); /* Borrowed from Devnagari */

		/* 2532 is a Hole */
		/* 2533 is a Hole */
		/* ASCII digits are replaced one for one */
0		encode("০");
1		encode("১");
2		encode("২");
3		encode("৩");
4		encode("৪");
5		encode("৫");
6		encode("৬");
7		encode("৭");
8		encode("৮");
9		encode("৯");

		/* A dot that is not part of any longer token such as .h or .n */
\.		encode("॰"); /* Borrowed from Devnagari */

 /******************************************************************************
 * END RULES
 ******************************************************************************/


%%
 /******************************************************************************
 * BEGIN USER SUBROUTINES
 ******************************************************************************/


/*
** yywrap() is the hook the generated scanner calls when it runs out of input.
** Returning 1 tells it that there is no further input to switch to.
*/
int yywrap(void)
{
	return 1;
}


/*
** Tell whether the character c can start a dependent vowel.
**
** Returns 1 for a, A, i, I, u, U, e, E, o and O. For a '.' one further
** character is read from the input and pushed straight back; the result is 1
** only if that character is r, R, l or L. Everything else, including a '.'
** at the end of the input, gives 0.
*/
int isVowel(int c) {

	if (c == '.') {
		/* The dot alone decides nothing: peek at the character after it */
		int next = input();

		if (next < 0)
			return 0; /* Nothing was read, so nothing to push back */

		unput(next); // Undo the lookahead

		return next == 'r' || next == 'R' || next == 'l' || next == 'L';
	}

	return c == 'a' || c == 'A'
		|| c == 'i' || c == 'I'
		|| c == 'u' || c == 'U'
		|| c == 'e' || c == 'E'
		|| c == 'o' || c == 'O';
}


/*
** Tell whether the character c can start a consonant.
**
** Returns 1 for every letter listed in the switch below. For a '.' one
** further character is read from the input and pushed straight back; the
** result is 1 only if that character is N or a. Everything else, including
** a '.' at the end of the input, gives 0.
*/
int isConsonant(int c) {

	if (c == '.') {
		/* The dot alone decides nothing: peek at the character after it */
		int next = input();

		if (next < 0)
			return 0; /* Nothing was read, so nothing to push back */

		unput(next); // Undo the lookahead

		return next == 'N' || next == 'a';
	}

	switch (c) {
		case '`': case 'w': case '\'':
		case 'k': case 'g':
		case 'C': case 'j':
		case 'T': case 'D': case 'N':
		case 't': case 'd': case 'n':
		case 'p': case 'b': case 'm':
		case 'y': case 'r': case 'l': case 'L': case 'v':
		case 'S': case 'X': case 's': case 'h':
		case 'H': /* As 'H' is an implicit consonant */
		case 'q': case 'K': case 'G':
		case 'z': case 'Z': case 'R': case 'f':
			return 1;

		default:
			return 0;
	}
}


/*
** Push a Hamza (') back onto the input when the next character(s) start a
** vowel, so that the vowel is scanned by one of the rules beginning with '.
** Does nothing unless the transliterate flag is set.
*/
void addHamza() {
	int next;
	int startsVowel;

	if (!transliterate) return;

	next = input(); // Attempt Lookahead
	if (next < 0) return;

	/* isVowel() does lookahead of its own, so it has to run while next is
	   still taken out of the input, i.e. before the unput() below */
	startsVowel = isVowel(next);

	unput(next); // Undo the lookahead
	if (startsVowel)
		unput('\'');
}


/*
** Print a virAm after a consonant unless the following character(s) start a
** vowel. A Zero-Width Non Joiner may be printed after it as well. Nothing
** happens unless the transliterate flag is set and the noViram flag is clear.
**
** Cases, in the order they are checked:
**   - end of input: print the virAm only
**   - a vowel follows: print nothing
**   - ".d" follows: print nothing, the decision is left to the .d rule
**   - 'H' follows: print the virAm and rescan the 'H' as 'h'
**   - anything else: print the virAm, then the ZWNJ if useNonJoiner is set
*/
void addViram() {
	int next;

	/* Not transliterating, or inside a noViram zone */
	if (!transliterate || noViram) return;

	next = input(); // Attempt Lookahead

	if (next < 0) {
		printf("%s",VIRAM); // What else can we do?
		return;
	}

	/* next is pushed back only after isVowel(), because isVowel() is going
	   to do lookahead too! */
	if (isVowel(next)) {
		unput(next); // Undo the lookahead
		return;
	}

	if (next == '.') {
		/* It could be a ".d" in which case the VIRAM decision is
		   delayed to beyond the ".d" */
		int after = input();

		if (after >= 0) {
			unput(after);
			if (after == 'd') {
				unput(next);
				return;
			}
		}
	}

	printf("%s",VIRAM);

	if (next == 'H') {
		/* Replace this now solitary 'H' by 'h' */
		unput('h');
		return;
	}

	/* Put the ZWNJ only if the flag is set */
	if (useNonJoiner) printf("%s",ZWNJ);
	unput(next); // Undo the lookahead
}


/*
** Push a .s back onto the input (presumably after a long vowel) to indicate
** a vowel grade reduction. If the input continues with .n, the .s is placed
** behind that .n, otherwise directly in front of whatever comes next.
*/
void reduceVowel() {

	int first = input(); // Look ahead for a .n

	if (first >= 0) {

		if (first == '.') {
			int second = input();

			if (second == 'n') {
				/* Pushed back in reverse, the input now reads .n.s */
				unput('s');
				unput('.');
				unput(second);
				unput(first);
				return;
			}

			if (second >= 0)
				unput(second);
		}

		unput(first);
	}

	/* No .n ahead: the input now reads .s followed by the old text */
	unput('s');
	unput('.');
}


/*
** This function corrects the .N to one of .N^k, .N^j, N, n, and m depending
** on the following consonant
*/
void correctAnusvar() {
	if (!transliterate) return;

	int c = input(); // Look ahead for the following consonant;
	unput(c); // Restore lookahead

	switch (c) {
		case 'k':
		case 'g':
			/* Replace by .N^k */
			unput('k');
			unput('^');
			unput('N');
			unput('.');
			break;

		case 'C':
		case 'j':
			/* Replace by .N^j */
			unput('j');
			unput('^');
			unput('N');
			unput('.');
			break;

		case 't':
		case 'd':
		case 'n':
			/* Replace by n */
			unput('n');
			break;

		case 'T':
		case 'D':
		case 'N':
			/* Replace by N */
			unput('N');
			break;

		case 'p':
		case 'b':
		case 'm':
			/* Replace by m */
			unput('m');
			break;
	}
}


/*
** This function decides if we should retain the 'a' that was scanned already
** or not. The 'a' is echoed to the output if the following character(s)
** start a vowel or are a '.d'. In every other case (a consonant, '.s', '.h',
** a delimiter, end of input, ...) nothing is printed and the 'a' is thereby
** removed. Note, this may need to be modified based upon future additions
** since this behavior is quite arbitrary. Retaining the 'a' is only a way of
** flagging potential bugs in the input text
*/
void adjustDanda() {
	int keep = 0;
	int next = input();

	if (next < 0) return;

	if (isVowel(next)) {
		keep = 1;
	} else if (next == '.') {
		int after = input(); // Lookahead for a d

		if (after >= 0) {
			unput(after);
			keep = (after == 'd');
		}
	}

	if (keep)
		ECHO; // Retain the 'a' in the output

	unput(next);
}


/*
** This function prints the incoming string utfValue to stdout if the
** transliterate flag is set. Else it just sends the scanned token as is to
** the output with ECHO.
**
** An if/else is used on purpose: ECHO is a macro supplied by the scanner
** generator and is not guaranteed to be an expression (current flex defines
** it as a do { ... } while (0) statement), so it cannot be an operand of ?:
*/
void encode(char* utfValue) {
	if (transliterate) {
		printf("%s",utfValue);
	} else {
		ECHO;
	}
}


/*
** The convh2s is a function deliberately PUT here for backward compatibility
** purposes (so that my UCS scheme is compatible with my Jtrans scheme)
** Its ORIGINAL intention was to replace any .h following a vowel by a .s
**
** Currently it just ignores any .h following a vowel: if the input continues
** with .h those two characters are consumed, otherwise the input is left
** untouched. Does nothing unless the transliterate flag is set.
*/
void convh2s() {
	int first;

	if (!transliterate) return;

	first = input();
	if (first < 0) return;

	if (first == '.') {
		/* Look for a further 'h' */
		int second = input();

		if (second == 'h')
			return; /* Both characters stay consumed */

		if (second >= 0)
			unput(second);
	}

	unput(first);
}

 /******************************************************************************
 * END USER SUBROUTINES
 ******************************************************************************/

 /*
 ** Source: geocities.com/roshbaby/UCS/beng
 **         (geocities.com/roshbaby/UCS) (geocities.com/roshbaby)
 */