/*
   bas2prot.c

   Converts a file of bases to a flat file database of proteins.

   Copyright 04/09/97 George Ruban, Joachim Reidl

   This software is in the public domain, and is freely available for
   any use, with no guarantee, on an "as is" basis. The authors would
   be interested in the ways in which this software is used and any
   problems that may be encountered. Users are requested, though not
   required, to send mail to the below addresses.

   This program and a paper describing it, "A THEORETICAL APPROACH TO
   IDENTIFY PUTATIVE SIGNAL SEQUENCES: PROVIDING EVIDENCE FOR
   SECRETED GENE PRODUCTS BY PURE COMPUTATIONAL ANALYSIS," are 
   available on the World Wide Web.
   paper:   http://www.geocities.com/SiliconValley/Vista/2013/joachim.html
   program: http://www.geocities.com/SiliconValley/Vista/2013/bas2prot.txt

   Written 02/07/1995 by George Ruban
   67 Thurston Str., Somerville, MA 02145, USA.
   http://www.geocities.com/SiliconValley/Vista/2013
   email: gmn@csa.bu.edu

   For Joachim Reidl
   Research Center for Infectious Disease, University of Wuerzburg,
   Roentgenring 11, 97070 Wuerzburg, Germany. Tel.: 0049 931 59509,
   e-mail: joachim.reidl@rzroe.uni-wuerzburg.de

   04/09/1997 - changed from C++ style comments, published on World Wide Web:
   03/31/1998 - replaced vonHeijne 1988 matrices with 1997 matrices,
        gave choice of 3 matrices to use.
*/

#include  /* for fprintf(), scanf(), fopen()... */
#include /* for strncpy() */
#include  /* for toupper(); */
#include /* for atoi() */

/* #define SWITCH_16_BITS  /* uncomment define for old DOS compilers */
#ifndef SWITCH_16_BITS  /* if the compiler doesn't use 16 bits only for switch() */
/* easy encoding of 3 chars into an unsigned long, to switch on */
#define e(a,b,c) (((((unsigned long)a<<8)+b)<<8)+c)
#else
/* if that's impossible - individual comparison of 3 chars to a string */
#define match(base,a,b,c) ((base[0] == (a)) && (base[1] == (b)) && (base[2] == (c)))
#endif /* SWITCH_16_BITS  */

#define MAXLINE 60 /* maximum length of a line of output for writeProtein() */
/* There are 4^3 = 64 different possible base triplets.  3 of them are
   stop codons. A length of 8000 amino acids should be plenty for
   a single protein, I hope... */
#define MAXPROTEIN 8000

#define MAXEUKARYOTIC 4087
#define MAXGRAMNEGATIVE 1196
#define MAXGRAMPOSITIVE 627

/* allow for different compilers */

char buffer[6][MAXPROTEIN]; /* 6 arrays to hold amino acids... */
char *buffptr[6] = {
  buffer[0], buffer[1], buffer[2],
  buffer[3] + MAXPROTEIN, buffer[4] + MAXPROTEIN, buffer[5] + MAXPROTEIN
}; /* pointer to current location in each buffer */

unsigned int minWeight = 0; /* minimum weight to write a protein */
unsigned int maxSlide = 0; /* maximum number of times to slide */
unsigned int minLength = 0; /* minimum length of a protein in order to write it. */
unsigned int maxPositiveResidues = 0; /* maximum distance of K || R from starting M for valid protein */
unsigned long bases = 0, proteins = 0;
/* number of bases read, proteins written */
/* number of amino acids excluded */
unsigned char nExcluded = 0;
unsigned char exclude[26] = ""; /* initialized to 0s */
/* 1 if the presence of corresponding amino acid means entire amino
   acid is to be excluded */

/* input and output files */
FILE *in, *out;

/* Amino acid counts for eukaryotic signal sequences from Gunnar von
   Heijne's paper in "Nucleic Acids Research" Volume 14, Number 11,
   1986, p.4683, IRL Press, Oxford.
   1 2 3 ... across the top, A-Y across the side */
const int cleavage [3] [26] [15] =
{
  /* Eukaryotic cleavage sites */
  {
    /*      -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   1   2  */
    /* A */{101,112,106,100,158,128,107,149,146,107,258, 80,458,141, 55},
    /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* C */{ 36, 32, 34, 31, 42, 66, 57, 39, 30, 34, 68, 21, 50, 23, 27},
    /* D */{  2,  0,  3,  0,  3,  2,  4,  5, 25, 14,  4, 44,  6, 68, 66},
    /* E */{  1,  2,  4,  5,  4,  6, 13,  9, 28, 27,  7, 66,  6, 92, 88},
    /* F */{ 75, 67, 80, 81, 65, 58, 92, 67, 25, 34,  7, 46,  6, 34, 28},
    /* G */{ 39, 24, 26, 34, 36, 50, 40, 29,108,125, 74, 38,184, 52, 57},
    /* H */{  5,  3,  4,  2,  4, 10, 14,  7, 23, 12,  5, 53,  2, 23, 22},
    /* I */{ 73, 67, 82, 59, 52, 52, 47, 69, 22, 43, 41, 18,  4, 37, 34},
    /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* K */{  4,  1,  1,  2,  0,  3,  3,  3, 20, 19,  7, 24,  4, 52, 40},
    /* L */{376,432,397,449,386,329,333,280, 98,147, 65,165, 21, 60, 51},
    /* M */{ 26, 15, 21, 19, 11, 30, 20, 11,  5, 12,  9, 14,  3, 14, 12},
    /* N */{  2,  4,  4,  6,  7,  7,  8,  7, 25, 14,  9, 37,  9, 26, 53},
    /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* P */{ 23, 13, 17,  9, 12, 13, 24, 60, 98, 74,  7, 17, 27, 11,166},
    /* Q */{  3,  6,  4,  9, 11, 19, 18, 13, 55, 40,  5, 72, 15,135, 42},
    /* R */{  7,  4,  4,  2,  3,  7,  9,  6, 34, 38,  5, 56, 20, 33, 40},
    /* S */{ 57, 47, 40, 44, 65, 76, 67, 66,117, 97,142,112,141, 91, 88},
    /* T */{ 32, 39, 30, 38, 38, 49, 42, 35, 70, 70,103, 44, 43, 39, 57},
    /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* V */{112,124,127,100, 99, 86, 79,129, 60, 72,191, 45,  8, 50, 56},
    /* W */{ 16, 13, 20, 11,  9, 13, 24, 21, 13, 13,  2, 33,  2,  9,  6},
    /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* Y */{ 17,  3,  6,  9,  6,  7, 10,  6,  8, 19,  2, 26,  2, 21, 23},
    /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  },
  /* Gram-negative cleavage sites */
  {
    /*      -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   1   2 */
    /* A */{ 63, 58, 56, 64, 63, 39, 41, 78, 34, 41,156, 20,229,102, 19},
    /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* C */{ 10,  6,  5,  5,  6,  5,  4,  7,  3,  0,  3,  3,  0,  4,  0},
    /* D */{  0,  1,  1,  2,  1,  2,  2,  0,  0,  1,  2,  2,  0, 17, 43},
    /* E */{  0,  0,  0,  1,  1,  0,  1,  0,  1,  2,  1,  4,  1, 21, 40},
    /* F */{ 15, 12, 18, 12, 17, 32, 31,  7, 17, 13,  1, 28,  0,  2,  1},
    /* G */{ 20, 22, 17, 13, 11, 29, 19, 13, 41, 15,  5,  5, 11, 12, 15},
    /* H */{  1,  0,  0,  0,  1,  0,  2,  1,  7,  1,  1, 32,  0,  2,  0},
    /* I */{ 17, 22, 16, 15, 16,  5, 25, 11,  4,  9,  3,  5,  0,  3,  9},
    /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* K */{  0,  1,  2,  0,  0,  0,  1,  0,  1,  4,  1,  3,  0, 12,  4},
    /* L */{ 60, 61, 71, 63, 67, 82, 44,  7, 31, 12, 15, 42,  4,  8, 11},
    /* M */{ 11,  9,  9,  8, 17, 12, 17,  9,  4,  2,  0, 19,  0,  3,  2},
    /* N */{  2,  1,  2,  2,  3,  0,  2,  5,  5, 13,  2, 15,  3, 10, 13},
    /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* P */{  2,  4,  6,  6,  5,  3,  4, 19, 16, 40,  1,  0,  1,  1, 23},
    /* Q */{  0,  0,  0,  2,  2,  1,  0,  1, 13, 12,  3, 25,  0, 30, 15},
    /* R */{  3,  2,  1,  1,  2,  3,  2,  0,  6,  5,  3,  4,  1,  3,  2},
    /* S */{ 16, 21, 19, 30, 20, 17, 29, 72, 39, 53, 25, 18, 11, 12, 16},
    /* T */{ 15, 11, 20, 17, 13, 15, 18, 23, 26, 19, 10, 13,  3,  6, 32},
    /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* V */{ 28, 32, 21, 21, 16, 18, 18, 11, 16, 14, 33, 14,  2, 12, 17},
    /* W */{  0,  0,  1,  1,  1,  1,  2,  1,  1,  0,  0,  5,  0,  1,  3},
    /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* Y */{  1,  1,  0,  2,  3,  2,  4,  1,  0, 10,  1,  9,  0,  5,  1},
    /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  },
  /* Gram-positive cleavage sites */
  {
    /*   -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   1   2 */
    /* A */{ 29, 27, 17, 32, 25, 17, 19, 27, 14, 13, 77, 18,114, 58, 15},
    /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* C */{  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,  1,  0,  0,  0},
    /* D */{  0,  1,  1,  0,  1,  0,  0,  4,  3,  7,  1,  3,  0, 15, 13},
    /* E */{  2,  0,  1,  1,  1,  1,  1,  3,  4,  3,  5, 11,  2, 13, 15},
    /* F */{  8,  8,  6, 11, 14,  5, 12,  1,  1,  3,  2, 10,  0,  2,  0},
    /* G */{  6, 13, 14, 14, 12, 17, 12,  9, 16,  7,  4, 11,  5,  1, 10},
    /* H */{  0,  0,  0,  0,  1,  0,  2,  3,  1,  2,  1,  8,  0,  1,  4},
    /* I */{ 10, 10, 14, 10,  7,  8,  6,  8,  2,  5,  4,  2,  0,  3,  4},
    /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* K */{  1,  0,  1,  0,  0,  0,  0,  2,  6,  7,  0, 12,  2,  8,  4},
    /* L */{ 26, 31, 26, 27, 31, 32, 18, 10,  5,  5,  3,  5,  0,  1,  1},
    /* M */{  3,  5,  4,  3,  2,  7,  3,  3,  0,  5,  0,  1,  0,  0,  0},
    /* N */{  1,  1,  0,  1,  5,  2,  7,  3, 13,  9,  1,  6,  1,  6,  5},
    /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* P */{  3,  3,  5,  3,  9,  6, 16, 22, 20, 15,  1,  2,  0,  2, 16},
    /* Q */{  0,  3,  0,  0,  0,  4,  7,  5,  8,  5,  0, 17,  1,  8,  6},
    /* R */{  1,  0,  0,  2,  0,  1,  0,  1,  1,  3,  2,  1,  3,  1,  1},
    /* S */{ 14, 11, 17, 13, 13, 14, 12, 11, 18, 13, 12, 19,  8, 14, 17},
    /* T */{ 17, 13, 14,  7,  7, 12,  9, 16, 19, 28,  1,  4,  1,  4, 21},
    /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* V */{ 18, 12, 18, 15, 11, 10, 14, 12, 10,  9, 27,  3,  3,  2,  8},
    /* W */{  0,  0,  0,  1,  1,  2,  1,  0,  0,  0,  0,  2,  0,  1,  0},
    /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    /* Y */{  0,  1,  1,  0,  0,  2,  1,  1,  0,  2,  0,  5,  1,  1,  1},
    /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }  /* end Gram-positive */
}; /* end cleavage */

int  table = -1; /* 0, 1, or 2, eu- or pro-karyotic */
char lookup(const char base[3]);
char pair(char base);
char add(char amin, char frame);
void writeProtein(char frame, unsigned long position,
          const char *start, unsigned int len);
int weigh(const char *prot, unsigned int len);

/* Returns 'A' on input 'T', 'C' on input 'G', and vice versa, the
   "pair" base on the other part of the double helix */
char pair(char base)
{
  switch(base){
  case 'A':
    return 'T';
  case 'T':
    return 'A';
  case 'C':
    return 'G';
  case 'G':
    return 'C';
  default:
    return '?'; /* error condition */
  }
}

/* returns amino acid character for input base triplet */
char lookup(const char base[3])
{
  char c;
#ifndef SWITCH_16_BITS
  unsigned long i;

  i = e(base[0], base[1], base[2]);

  /* printf("looking up '%s' = %lu\n", base, i); */
  switch(i)
  {
  case e('T','T','T'):case e('T','T','C'):
    c = 'F'; break;

  case e('T','T','A'):case e('T','T','G'):
  case e('C','T','T'):case e('C','T','C'):
  case e('C','T','A'):case e('C','T','G'):
    c = 'L'; break;

  case e('A','T','T'):case e('A','T','C'):
  case e('A','T','A'):
    c = 'I'; break;
  case e('A','T','G'):
    c = 'M'; break;

  case e('G','T','T'):case e('G','T','C'):
  case e('G','T','A'):case e('G','T','G'):
    c = 'V'; break;

  case e('T','C','T'):case e('T','C','C'):
  case e('T','C','A'):case e('T','C','G'):
  case e('A','G','T'):case e('A','G','C'):
    c = 'S'; break;

  case e('C','C','T'):case e('C','C','C'):
  case e('C','C','A'):case e('C','C','G'):
    c = 'P'; break;

  case e('A','C','T'):case e('A','C','C'):
  case e('A','C','A'):case e('A','C','G'):
    c = 'T'; break;

  case e('G','C','T'):case e('G','C','C'):
  case e('G','C','A'):case e('G','C','G'):
    c = 'A'; break;

  case e('T','A','T'):case e('T','A','C'):
    c = 'Y'; break;
  case e('T','A','A'):case e('T','A','G'):
  case e('T','G','A'):
    c = '*'; break; /* special "stop codon" */

  case e('C','A','T'):case e('C','A','C'):
    c = 'H'; break;
  case e('C','A','A'):case e('C','A','G'):
    c = 'Q'; break;

  case e('A','A','T'):case e('A','A','C'):
    c = 'N'; break;
  case e('A','A','A'):case e('A','A','G'):
    c = 'K'; break;

  case e('G','A','T'):case e('G','A','C'):
    c = 'D'; break;
  case e('G','A','A'):case e('G','A','G'):
    c = 'E'; break;

  case e('T','G','T'):case e('T','G','C'):
    c = 'C'; break;
  case e('T','G','G'):
    c = 'W'; break;

  case e('C','G','T'):case e('C','G','C'):
  case e('C','G','A'):case e('C','G','G'):
  case e('A','G','A'):case e('A','G','G'):
    c = 'R'; break;

  case e('G','G','T'):case e('G','G','C'):
  case e('G','G','A'):case e('G','G','G'):
    c = 'G'; break;

  default:           /* error condition, return '?', for unknown */
    c = '?'; break;
  }
#else /* SWITCH_16_BITS */

  if( match(base,'T','T','T') || match(base,'T','T','C') )
    c = 'F';

  else if( match(base,'T','T','A') || match(base,'T','T','G') ||
           match(base,'C','T','T') || match(base,'C','T','C') ||
           match(base,'C','T','A') || match(base,'C','T','G') )
    c = 'L';

  else if( match(base,'A','T','T') || match(base,'A','T','C') ||
           match(base,'A','T','A') )
    c = 'I';
  else if( match(base,'A','T','G') )
    c = 'M';

  else if( match(base,'G','T','T') || match(base,'G','T','C') ||
           match(base,'G','T','A') || match(base,'G','T','G') )
    c = 'V';

  else if( match(base,'T','C','T') || match(base,'T','C','C') ||
           match(base,'T','C','A') || match(base,'T','C','G') ||
           match(base,'A','G','T') || match(base,'A','G','C') )
    c = 'S';

  else if( match(base,'C','C','T') || match(base,'C','C','C') ||
           match(base,'C','C','A') || match(base,'C','C','G') )
    c = 'P';

  else if( match(base,'A','C','T') || match(base,'A','C','C') ||
           match(base,'A','C','A') || match(base,'A','C','G') )
    c = 'T';

  else if( match(base,'G','C','T') || match(base,'G','C','C') ||
           match(base,'G','C','A') || match(base,'G','C','G') )
    c = 'A';

  else if( match(base,'T','A','T') || match(base,'T','A','C') )
    c = 'Y';
  else if( match(base,'T','A','A') || match(base,'T','A','G') ||
           match(base,'T','G','A') )
    c = '*'; /* special "stop codon" */

  else if( match(base,'C','A','T') || match(base,'C','A','C') )
    c = 'H';
  else if( match(base,'C','A','A') || match(base,'C','A','G') )
    c = 'Q';

  else if( match(base,'A','A','T') || match(base,'A','A','C') )
    c = 'N';
  else if( match(base,'A','A','A') || match(base,'A','A','G') )
    c = 'K';

  else if( match(base,'G','A','T') || match(base,'G','A','C') )
    c = 'D';
  else if( match(base,'G','A','A') || match(base,'G','A','G') )
    c = 'E';

  else if( match(base,'T','G','T') || match(base,'T','G','C') )
    c = 'C';
  else if( match(base,'T','G','G') )
    c = 'W';

  else if( match(base,'C','G','T') || match(base,'C','G','C') ||
          match(base,'C','G','A') || match(base,'C','G','G') ||
          match(base,'A','G','A') || match(base,'A','G','G') )
    c = 'R';

  else if( match(base,'G','G','T') || match(base,'G','G','C') ||
          match(base,'G','G','A') || match(base,'G','G','G') )
    c = 'G';

  else           /* error condition, return '?', for unknown */
    c = '?';
#endif /* SWITCH_16_BITS */
  return c;
}

/* returns the next legal base character, ignoring others */
int getBase(FILE* in)
{
  int c;

  do{
    c = getc(in);
    if(c==EOF)
      break;
    c = toupper(c);
  } while(c!='A' && c !='C' && c!= 'T' && c!= 'G');
  return c;
}

/* adds an amino acid to a frame, writes the protein if a stop codon. */
char add(char amin, char frame)
{
  /* printf("bases %ld: adding char %c to frame %u\n", bases, amin, frame); */
  if(amin == '*' || amin == 0)
  { /* special stop codon */
    unsigned int len; /* length of protein string */

    if(frame < 3)
    { /* writing forward */
      if(amin != 0)
        *buffptr[frame]++ = amin;
      len = buffptr[frame] - buffer[frame];
      writeProtein(frame, bases - len*3, buffer[frame], len);
      buffptr[frame] = buffer[frame]; /* reset buffptr */
    }
    else
    { /* frame >= 3, writing backward */
      len = buffer[frame] + MAXPROTEIN - buffptr[frame];
      writeProtein(frame, bases - len*3 - 1, buffptr[frame], len);
      /* -1 to position because the '*' isn't counted in yet */
      buffptr[frame] = buffer[frame] + MAXPROTEIN; /* reset buffptr */
      if(amin != 0)
        *--buffptr[frame] = amin;
    }
  } else { /* write normal, non-stop codon */
    if(frame < 3){
      *buffptr[frame]++ = amin;
      if(buffptr[frame] >= buffer[frame] + MAXPROTEIN)
        return 0;   /* overflow of buffer */
    } else { /* write backward */
      *--buffptr[frame] = amin;
      if(buffptr[frame] <= buffer[frame])
        return 0;   /* overflow of buffer */
    }
  }
  return 1;
}/* end add() */

/* Writes the protein to the file.

   A typical signal sequence needs to have the positively charged amino
   acids first (K or R or both) which are right next (5 to 10 residues) to
   the start amino acid (M). These positive residues (K,R) are not evaluated
   by the Hejne Matrix. Only the 15 residues which follows next to K,or R
   are evaluated, noted as -13, to 2, and also known as cleavage site. Now,
   if we could let the program first look for the presents of K and R right
   next (window 5 to 10 or variable) to the M and then allow the algorithem
   to start to evaluate the next coming residues according to the matrix
   (15 positions), we definitly would have a much higher yield in putative
   secreted proteins.

First task: i) find an open reading frame defined with
the start M and check for residues K or R right next (window 1 to 10,
optional), then ii) apply to such candidates, and only to such,  a
sliding window (1- 15, optional) which compares the following residues
to the matrix and stores them away if minimal weight factor level is
reached.

The first considered (to be compared with
the matrix) amino acid is position -13 then the secound is -12, down to
-1 (here the cut is produced) +1 means the mature protein starts.

  example:

     -13         -112
MFGKKRLALVALVALLLLAXA  (this is a good looking siganl sequence!!!!)
MFGGDFLALVALVALLLLAXA  (this is not a signal sequence at all, because it
lacks the positive residues K and/or R!!!!)

 */
void writeProtein(char frame,
                  unsigned long position,
                  const char *prot,
                  unsigned int len /* length of protein in aminos */ )
{
  unsigned int wt, maxWt = 0;
  unsigned int i, maxI; /* counter and its endpoint */
  unsigned int maxPos=0; /* position of max weight of protein */
  const char *start = prot; /* start of final protein */
  const char *residues; /* start of weighed residues in final protein */
  unsigned int endKR=0; /* distance until after last contiguous K/R amino of K/R sequence
                         starting within maxPositiveResidues of start */

  /* test for excluded amino acids */
  if(nExcluded > 0)
  {
    for(i=0; i= minLength && *start !='M')
    {
      start ++;
      len--;
    }
    if(len < minLength)
      return; /* too short, or no 'M' at all... */
    /* search for positive residues, K or R */
    maxI = (maxPositiveResidues > len) ? len : maxPositiveResidues;
    for(i=0; i maxWt){
          maxWt = wt;
          maxPos = i;
		}
	  }
      if(maxWt >= minWeight) break; /* found one */
	}
    /* otherwise reject this whole protein, try next starting one further down */
    len--;
    start++;
  }/* end for(;;) - found a good enough weight, now write it  */

  if(frame < 3)
    position += 3 * (start - prot);
  /* For the reversed frames the position is the location of the stop
     codon anyway, so where a protein starts doesn't affect that. */

  proteins++; /* wrote another protein */
  fprintf(out, "> %d / %lu (%u amino acids long,",
          frame+1, position + 1, len);
  fprintf(out, " weight %d, from position %d)\n", maxWt, maxPos + endKR + 1);
  while(len > MAXLINE){
    fwrite(start, sizeof(char), MAXLINE, out);
    len -= MAXLINE;
    start += MAXLINE;
    putc('\n', out);
  }
  fwrite(start, sizeof(char), len, out);
  putc('\n', out); /* blank line between proteins */
  putc('\n', out);
}

/* weighs the likelihood of the protein being valid, the higher the better */
int weigh(const char *prot, unsigned int len)
{
  register unsigned int i;
  unsigned int stop;
  int total = 0;
  
  if(len>15) stop = 15;
  else{
    stop = len;
    if(prot[stop -1] == '*') /* don't weigh the stop codon */
      stop --;
  }
  for(i=0; i maxWeight){
    fprintf(stderr, "Illegal weight %u.  Must be between 0 and %u.\n",
        minWeight, maxWeight);
    return(1);
  }

  printf("Maximum number of times to slide weighing window? ");
  scanf("%u", &maxSlide);
  if(maxSlide < 1)
    maxSlide = 1; /* '0' shorthand for '1' (?) */

  printf("Maximum distance of positive residue (K or R)\n"
         "from protein start codon (M) (1 to 10, or more)? ");
  scanf("%u", &maxPositiveResidues);
  if(maxPositiveResidues < 1)
    maxPositiveResidues = 10; /* '0' shorthand for '10' (?) */

  printf("Minimum length (in amino acids) to write a protein? ");
  scanf("%u", &minLength);
  if(minLength < 0 || minLength > MAXPROTEIN){
    fprintf(stderr, "Illegal minimum length %u, must be between 0 and %ld.",
        minLength, MAXPROTEIN);
    return(1);
  }
  if(minLength <1)
    minLength = 1; /* '0' shorthand for '1' (?) */
  printf("Any amino acids to be excluded from database\n "
     "(especially W, P, N, R, K, D, or C - 0 for none)? ");
  scanf("%s", aminos);
  nExcluded = 0;
  for(c=0; aminos[c] != '\0'; ++c){
    if(isalpha(aminos[c]) && aminos[c] >= 'A' && aminos[c] <= 'Z'){
      exclude[ aminos[c] - 'A'] = 1;
      ++nExcluded;
    }
  }
  if(nExcluded >= 1){
    printf("Excluding %u amino acids:\n", nExcluded);
    for(c = 0; c<26; ++c)
      if(exclude[c]) printf(" %c", c+'A');
    printf("\n");
  }
  printf("Beginning program.\n");

  /* read 2 first bases, to "prime" buffers */
  c = getBase(in);
  if(c==EOF){
    fprintf(stderr, "Input file ended on first character!\n");
    return(2);
  }
  fbuf[1] = (char)c;
  rbuf[1] = pair((char)c);
  
  c = getBase(in);
  if(c==EOF){
    fprintf(stderr, "Input file ended on second character!\n");
    return(2);
  }
  fbuf[2] = (char)c;
  rbuf[0] = pair((char)c);
  
  bases = 3; /* number of bases read */
  
  /* main loop, reading bases and writing amino acids to buffers */
  for(frame=0;(c=getBase(in))!=EOF; frame=(char)((frame+1)%3)){
    /* shift back buffers */
    fbuf[0] = fbuf[1];
    fbuf[1] = fbuf[2];
    fbuf[2] = (char)c;
    
    rbuf[2] = rbuf[1];
    rbuf[1] = rbuf[0];
    rbuf[0] = pair((char)c);
    
    if(!add(lookup(fbuf), frame) ||
       !add(lookup(rbuf), (char)(frame+3))){
      fprintf(stderr, "buffer overflow: a protein of over %ld amino acids\n"
          "has been found, and this program can't deal with it.\n",
          MAXPROTEIN);
      return(3);
    }
    if(++bases%1000 == 0)
      printf("Read %ld bases - wrote %ld proteins.\n", bases, proteins);
  }
  for(frame=0; frame<6; ++frame)
    add('\0', frame);        /* phony stop */
  fclose(in);
  fclose(out);
  printf("Done! %ld bases read, %ld proteins written.\n", bases, proteins);
  return 0;
}/* end main() */


/* This function is not used in the program any more. It was once used
   to reverse the output of the last 3 (backwards) frames, now that is
   done in the first pass. Function left in code "just in case". */
void reverseFile(const char *inName, const char *outName)
{
  FILE *ifp, *ofp;
  long begin, current;
  
  printf("Reversing file '%s' to '%s'.\n", inName, outName);
  ifp = fopen(inName, "r");
  if(ifp == NULL){
    fprintf(stderr, "could not open \"%s\"", inName);
    return;
  }
  ofp = fopen(outName, "w");
  if(ofp == NULL){
    fprintf(stderr, "could not open \"%s\"", outName);
    return;
  }
  begin = ftell(ifp);
  fseek(ifp, 0, SEEK_END);
  current = ftell(ifp);
  while(current>begin){
    fseek(ifp, current-1, SEEK_SET);
    current = ftell(ifp);
    putc(getc(ifp), ofp);
    if((current-begin)%1000 == 1000-1)
      printf("%ld characters reversed.\n", current-begin);
  }
  fclose(ifp);
  fclose(ofp);
}




    Source: geocities.com/siliconvalley/vista/2013

               ( geocities.com/siliconvalley/vista)                   ( geocities.com/siliconvalley)