
/*======================================================= 

                   STRAT.C

Program for association mapping in structured populations. 
Uses output from "structure".

                    by

JK Pritchard, M. Stephens, NA Rosenberg and P Donnelly.
Code written by Jonathan K. Pritchard

See additional details in README file.

=========================================================

This version differs from v 1.0 (5/2000) by  [x means done, - means checked]
These changes made June 2003, apart from (1) which was done wrong.

x-  1. fixing a bug in the calculation of the Chi-Square test statistic 
x-  2. fixing a bug spotted by Nina Wawro in the main test statistic.

Joel Gelernter comments (to be implemented):

x-  3. one-line input               [should work now--check]
x-  4. allow missing phenotypes     
x-  5. allow phenotypes to be 1,2; delete NUMPHENS; CHECK freqs file
x-  6. allow multiple columns of phenotype data
 
x- 7. allow command-line arguments [modify params.c]

   8. update manual to reflect changes
   9. perform simulations that repeat TPB article


=========================================================*/
/* Release identifier; WelcomeSTRAT prints it in the program banner. */
#define VERSION "1.1 June 2003"




#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "ran.h"
#include "mymath.h"
/*#include "structure.h"*/
#include "params.h"
#include "datain.h"

/* Compile-time settings that are hard-coded rather than read from the
   parameter files. */
#define SAVEFR         10       /*frequency of saving output file*/
#define NUMBOXES      100       /*store histogram of p-values with this many slots*/

#define DebugEM         0             /*debug switch for EM; only referenced from commented-out code in this part of the file*/
#define UNDERFLOW     1e-100    /*take logs when doubles get this small*/
#define STRATGENELEN       15   /*NOTE(review): presumably the maximum marker-name length; not used in this part of the file -- confirm*/


/* Index helpers: each macro maps a multi-dimensional index onto the offset
   in a flat, row-major array.  The rightmost argument varies fastest. */
#define CanPos(ind,line) ((ind)*(2)+(line)) /*position in candidate data array*/
#define CanPPos(pop,allele) ((pop)*(MAXALLELES)+(allele)) /*position in gene freq array*/
#define PhAlFreqsPos(pop,phen,allele) ((pop)*(NUMPHENS)*(MAXALLELES)+(phen)*(MAXALLELES)+(allele)) /*position in the per-phenotype freq array*/
#define PoolTransPos(code,pos) ((code)*(MAXALLELES+1) + (pos)) /*rows of width MAXALLELES+1, one row per pooled allele code*/

/*For use in EM only; these index the scratch arrays of the same name
  (Pr, Sum, PrPh, SumPh, N, NPh, NsumPh, OldAlFreqs, OldPhAlFreqs)*/
#define PrPos(ind,line,pop) ((ind)*(2)*(MAXPOPS)+(line)*(MAXPOPS)+(pop))
#define SumPos(ind,line) ((ind)*2 + (line))
#define PrPhPos(ind,line,pop) ((ind)*(2)*(MAXPOPS)+(line)*(MAXPOPS)+(pop))
#define SumPhPos(ind,line) ((ind)*2 + (line))
#define NPos(pop,allele) ((pop)*(MAXALLELES) + (allele))
#define NPhPos(pop,phen,allele) ((pop)*(NUMPHENS)*(MAXALLELES)+(phen)*(MAXALLELES)+(allele))
#define NsumPhPos(pop,phen) ((pop)*(NUMPHENS)+(phen))
#define OldAlFreqsPos(pop,allele) ((pop)*(MAXALLELES) + (allele))
#define OldPhAlFreqsPos(pop,phen,allele) ((pop)*(NUMPHENS)*(MAXALLELES)+(phen)*(MAXALLELES)+(allele))

/*====================================================================
Scratch arrays used by the function EM.  They are file-scope globals so
that the memory is allocated once (AllocateEMMemory) and released once
(FreeEMMemory) instead of on every call to EM.  The sizes given below
are the ones requested in AllocateEMMemory.*/

double *Pr; /* NUMINDS*2*MAXPOPS prob allele (i,a) came from pop k*/
double *Sum; /* NUMINDS*2   Used to normalise Pr */
double *PrPh; /* NUMINDS*2*MAXPOPS  as Pr, but using the phenotype-specific freqs */
double *SumPh; /* NUMINDS*2  Used to normalise PrPh */
double *N;    /* MAXPOPS*MAXALLELES stores expected number of each allele in each pop*/
double *Nsum;  /* MAXPOPS    Used to normalise N */
double *NPh;  /* MAXPOPS*NUMPHENS*MAXALLELES stores expected number of each allele 
		in each pop for given phenotype*/
double *NsumPh; /* MAXPOPS*NUMPHENS  Used to normalise NPh */
double *OldAlFreqs; /* MAXPOPS*MAXALLELES  CandP from the previous EM iteration */
double *OldPhAlFreqs; /* MAXPOPS*NUMPHENS*MAXALLELES  PhAlFreqs from the previous EM iteration */

/* Forward declarations for functions that are called before they are
   defined in this file. */
void FreeEMMemory();
void PrintFreqs(FILE *file, double *CandP,double *PhAlFreqs,
		int numalleles,int loc,int *PoolTrans, int *TransPheno,char *Markername);
void PrintGeneName(FILE * file, int loc, char *Markername);

/*===========================================================*/
void WelcomeSTRAT(FILE *file)
     /*write the program banner (authors, citation, version) to file*/
{
  static const char rule[] =
    "----------------------------------------------------\n";

  fputs("\n\n",file);
  fputs(rule,file);
  fputs("STRAT by Pritchard, Stephens, Rosenberg and Donnelly (AJHG, 2000)\n",file);
  fputs("            Code by J.K. Pritchard\n",file);
  fprintf(file,"            Version %s\n",VERSION);
  fputs(rule,file);

  fputs("\n\n",file);
}
/*-------------------------------------------------------*/
void Kill ()
     /*report that earlier errors are fatal, then terminate with exit status 1*/
{
  fputs("\nExiting the program due to error(s) listed above.\n\n",stdout);
  exit (1);
}
/*-----------------------------------------*/
void FilePrelims(FILE *file)
     /*write the banner followed by a summary of the run settings to file*/
{
  WelcomeSTRAT(file);

  fprintf(file,"Input data file = %s\n",DATAFILE);
  fprintf(file,"Structure Results file = %s\n",OUTFILE);
  fprintf(file,"Number of populations = %d\n",MAXPOPS);
  fprintf(file,"Using phenotype column: %d\n",PHENOTYPECOL);
  fprintf(file,"Number of simulated test stats per locus = %d\n",NUMSIMSTATS);
  fprintf(file,"EM stopping point = %1.3e\n",EMERROR);

  /*a POOLFREQ of zero means that pooling is switched off*/
  if (!POOLFREQ)
    fputs("No pooling of rare alleles\n",file);
  else
    fprintf(file,"Alleles with fewer than %d copies pooled\n",POOLFREQ);

  fputs("\n\n\n",file);
}
/*-----------------------------------------*/
void FreeAllSTRAT(int *Geno,struct IND *Individual,int *Translation,
		  int *NumAlleles,double *Q,int *Phenotype, 
		  int *Candidate,int *SimCandidate,
		  double *CandP, double *SimCandP, double *PhAlFreqs,
		  int *PoolTrans)
     /*Release the main data arrays.  A NULL pointer is reported as
       "free error <code>" instead of being freed; the codes follow the
       historical numbering, which has no 5.*/
{
  void *blocks[12];
  static const int codes[12] = {1,2,3,4,6,7,8,9,10,11,12,13};
  int b;

  blocks[0]  = Geno;
  blocks[1]  = Individual;
  blocks[2]  = Translation;
  blocks[3]  = NumAlleles;
  blocks[4]  = Q;
  blocks[5]  = Phenotype;
  blocks[6]  = Candidate;
  blocks[7]  = SimCandidate;
  blocks[8]  = CandP;
  blocks[9]  = SimCandP;
  blocks[10] = PhAlFreqs;
  blocks[11] = PoolTrans;

  for (b=0; b<12; b++)
    {
      if (blocks[b]==NULL)
	printf("free error %d\n",codes[b]);
      else
	free(blocks[b]);
    }
}
/*-----------------------------------------*/
void InitCounts(int counts[NUMBOXES])
     /*zero every slot of the p-value histogram*/
{
  int *slot = counts;
  int *const end = counts + NUMBOXES;

  while (slot < end)
    *slot++ = 0;
}
/*-----------------------------------------*/
void ReadQ(double *Q,struct IND *Individual)
/*Read Q-hat from the output file produced by structure.

  Q          filled in at QPos(ind,pop) for every individual and population.
  Individual not used here; kept so that existing callers still compile.

  With MAXPOPS==1 no file is read and every individual gets Q = 1.0.
  Otherwise the file "<OUTFILE>_q" is read: each individual contributes one
  label token, one more token when POPDATA is set, and then MAXPOPS
  numbers.  A file that cannot be opened or that ends early is fatal
  (Kill).  A token rejected by CheckIfValidDouble leaves the matching Q
  entry untouched.*/
{
  char name[STRLEN+20];
  char astring[LABELLEN];
  char qstring[20];
  FILE *QRESULTS;
  int ind;
  int pop;
  int strlength;
 
  if (MAXPOPS==1) 
    {
      for (ind=0; ind<NUMINDS; ind++)
	Q[QPos(ind,0)] = 1.0;
      return;
    }

  /*bounded formatting: an over-long OUTFILE is truncated (and then fails
    to open) instead of overrunning name*/
  snprintf(name,sizeof(name),"%s_q",OUTFILE);  /*open results file*/
  QRESULTS = fopen(name,"r");       

  if (QRESULTS==NULL)
    {
      printf("WARNING: Unable to open results file %s produced by `structure'.\n",
	     name);
      printf("Before running STRAT, you need to run `structure' with PRINTQHAT=1.\n");
      printf("This produces an input file that is read by STRAT\n");
      Kill();
    }

  for (ind=0; ind<NUMINDS; ind++)
    {
      /*read preamble for each line*/
      strlength = ReadString(astring,LABELLEN,QRESULTS);
      if (strlength==0)  /*label or ind number--ignore*/
	{
	  printf("Premature end of file %s\n",name);
	  Kill();
	}	  
      if (POPDATA)  /*ignore this too*/
	{
	  strlength = ReadString(astring,LABELLEN,QRESULTS);
	  if (strlength==0)
	    {
	      printf("Premature end of file %s\n",name);
	      Kill();
	    }	
	}

      /*read Q-hat*/
      for (pop=0; pop<MAXPOPS; pop++)
	{
	  strlength = ReadString(qstring,20,QRESULTS);
	  if (strlength==0)
	    {
	      printf("Premature end of file %s\n",name);
	      Kill();
	    }
	  if (CheckIfValidDouble(qstring,ind,0))
	    Q[QPos(ind,pop)] = atof(qstring);
	}
    }

  fclose(QRESULTS);   /*previously the stream was left open*/
}
/*-----------------------------------------*/
void  RemoveMissings(double *Q, double *modQ, int *Phenotype, int *modPhenotype)
     /*create modified arrays Q and Phenotype that remove individuals with 
       missing data*/
{
  int ind;
  int modind = 0;
  int pop;
  
  for (ind=0; ind<NUMINDS; ind++)   /*copy data for all individuals with pheno data*/
    if (Phenotype[ind] != MISSING)
      {
	modPhenotype[modind] = Phenotype[ind];
	for (pop=0; pop<MAXPOPS; pop++)
	  modQ[QPos(modind,pop)] = Q[QPos(ind,pop)];
	modind++;
      }
}
/*-----------------------------------------*/
void CopyPhenotype(int *Phenotype,struct IND *Individual)
{
  int ind;
  for (ind=0; ind<NUMINDS; ind++)
    Phenotype[ind] = Individual[ind].Phenotype;
}

/*-----------------------------------------*/
void TranslatePhenotypes(int *Phenotype,int *TransPheno)
     /*Recode the phenotype values in place as 0,1,2,... in order of first
       appearance and set NUMPHENS to the number of distinct codes found.
       TransPheno[c] receives the original value that was recoded as c.

       Missing phenotypes keep the value MISSINGPHENO, unless that value
       lies in 0..MAXALLELES-1, in which case they are recoded as -9.
       ORIGMISSINGPHENO records the original sentinel and MISSINGPHENO is
       updated to the new one.  A table of counts is printed to stdout.*/
{
  int *tally;                   /*number of individuals per recoded phenotype*/
  int ncodes = 0;               /*distinct phenotypes seen so far*/
  int nmiss = 0;
  int recodedmissing;
  int ind, slot, raw;

  tally = calloc(NUMINDS,sizeof(int));
  if (tally==NULL)
    {
      printf("Error in assigning memory (not enough space?)\n"); 
      Kill();
    }

  /*make sure the missing-data value cannot collide with a recoded value*/
  ORIGMISSINGPHENO = MISSINGPHENO;
  recodedmissing = ((MISSINGPHENO >= 0) && (MISSINGPHENO < MAXALLELES))
    ? -9 : MISSINGPHENO;

  for (ind = 0; ind < NUMINDS; ind++)
    {
      raw = Phenotype[ind];
      if (raw == MISSINGPHENO)
	{
	  Phenotype[ind] = recodedmissing;
	  nmiss++;
	  continue;
	}

      /*linear search through the codes assigned so far*/
      slot = 0;
      while ((slot < ncodes) && (TransPheno[slot] != raw))
	slot++;

      if (slot == ncodes)       /*first time this value has been seen*/
	{
	  TransPheno[ncodes] = raw;
	  tally[ncodes] = 0;
	  ncodes++;
	}

      Phenotype[ind] = slot;
      tally[slot]++;
    }

  NUMPHENS = ncodes;

  printf("\n\n");
  printf("Phenotype column %d\n",PHENOTYPECOL);
  for (slot=0; slot<NUMPHENS; slot++)
    printf("%d Individuals with phenotype %d \n", tally[slot], TransPheno[slot]);  
  printf("%d Individuals with missing phenotype",nmiss);
  printf("\n-------\n%d Total individuals\n\n",NUMINDS);

  MISSINGPHENO = recodedmissing;
  free(tally);
}

/*-----------------------------------------*/
void CheckPhens(int *Phenotype)
     /*Verify that every phenotype lies in {0...NUMPHENS-1}.  Each offender
       is listed (1-based individual number) and, if there is at least
       one, the program is halted.  This function now retired*/
{
  int found = 0;
  int ind;

  for (ind=0; ind<NUMINDS; ind++)
    {
      const int phen = Phenotype[ind];

      if ((phen>=0) && (phen<NUMPHENS))
	continue;               /*phenotype ok*/

      if (found==0) printf("\n\n\n\n");
      found = 1;
      printf("WARNING: Individual %d, phenotype = %d\n",ind+1,phen);
    }

  if (found==0)
    return;

  printf("\n");
  printf("All phenotypes must coded as integers in the range\n");
  printf("{0...NUMPHENS-1}. (The current value of NUMPHENS is %d.)\n",NUMPHENS);
  Kill();
}
/*-----------------------------------------*/
void RemoveMissingPhens(double *Q,struct IND *Individual,int *Geno)
/*Delete individuals whose phenotype is outside 0..NUMPHENS-1 by sliding
the later individuals up in Individual, Q and Geno, and then shrink
NUMINDS to the number that remain.  An out-of-range phenotype that is
not equal to MISSING is reported.  The original author's note says this
function is probably best left unused, because changing NUMINDS is
risky; out-of-range phenotypes are meant to halt the program instead.*/
{
  int kept = 0;                 /*next free slot == number kept so far*/
  int ind,line,loc,pop,c;
  int phen;

  for (ind=0; ind<NUMINDS; ind++)
    {
      phen = Individual[ind].Phenotype;

      if ((phen<0) || (phen>=NUMPHENS))   /*drop this individual*/
	{
	  if (phen != MISSING)
	    printf("Warning: individual %d has phenotype %d which is out-of-range 0..%d, but not MISSING (%d)",ind,phen,NUMPHENS-1,MISSING);
	  continue;
	}

      if (kept < ind)           /*a gap has opened up: move this record into it*/
	{
	  Individual[kept].Population = Individual[ind].Population;
	  Individual[kept].PopFlag = Individual[ind].PopFlag;
	  Individual[kept].Phenotype = Individual[ind].Phenotype;
	  for (c=0; c<LABELLEN; c++)
	    Individual[kept].Label[c] = Individual[ind].Label[c];

	  for (pop=0; pop<MAXPOPS; pop++)
	    Q[QPos(kept,pop)] = Q[QPos(ind,pop)];

	  for (line=0; line<2; line++)
	    for (loc=0; loc<NUMLOCI; loc++)
	      Geno[GenPos(kept,line,loc)] = Geno[GenPos(ind,line,loc)];
	}
      kept++;
    }

  /*a MISSING value that is itself a legal phenotype cannot be detected above*/
  if (!((MISSING<0) || (MISSING>=NUMPHENS)))
    {
      printf("\nWARNING: If there are any missing phenotype data, these will not be\n");
      printf("recognized, because the value for MISSING is in the range 0..%d used\n",
	     NUMPHENS-1);
      printf("used for phenotypes\n\n");
    }

  NUMINDS = kept;
}
/*-----------------------------------------*/
int CopyCandidate(int *Candidate, int *Geno, int loc, int *Phenotype)
     /*Pack the two allele copies at locus loc of every individual whose
       phenotype is not MISSINGPHENO into consecutive rows of Candidate.
       Returns the number of individuals copied.*/
{
  int kept = 0;
  int ind;

  for (ind=0; ind<NUMINDS; ind++)
    {
      if (Phenotype[ind] == MISSINGPHENO)
	continue;

      Candidate[CanPos(kept,0)] = Geno[GenPos(ind,0,loc)];
      Candidate[CanPos(kept,1)] = Geno[GenPos(ind,1,loc)];
      kept++;
    }

  return kept;
}
/*------------------------------------------------------------*/
double ComputeChiSq(int *Candidate, int *df, int *Phenotype, int numindsphen)
     /*Chi-square statistic comparing the candidate-locus alleles of the
       individuals with phenotype 1 against those with phenotype 0.

       Candidate    allele copies, indexed by CanPos(ind,line)
       df           passed through to ChiSq (presumably receives the
                    degrees of freedom -- confirm against mymath.h)
       Phenotype    recoded phenotype of each of the numindsphen individuals
       numindsphen  number of individuals with phenotype data

       Requires phenotypes 0 and 1: any other phenotype is left out of
       both samples and triggers a warning.  Pooling of rare alleles is
       delegated to ChiSq via POOLFREQ.  Running out of memory is fatal
       (Kill); previously it was only reported and the NULL buffers were
       then written to.*/
{
  int *list1;  /*[NUMINDS*2] alleles carried by phenotype 1*/
  int *list2;  /*[NUMINDS*2] alleles carried by phenotype 0*/
  int ind;
  int list1num,list2num;
  double chisq;

  list1 = calloc(NUMINDS*2,sizeof(int));
  list2 = calloc(NUMINDS*2,sizeof(int));
  if ((list1==NULL)||(list2==NULL)) 
    {
      printf("WARNING: error assigning memory in ComputeChiSq\n");
      free(list1);              /*free(NULL) is harmless*/
      free(list2);
      Kill();
    }

  list1num = list2num = 0;
  for (ind=0; ind<numindsphen; ind++)   /*iterate over individs with phenotype data*/
    {
      if (Phenotype[ind] ==0)
	{	  
	  list2[2*list2num] = Candidate[CanPos(ind,0)];
	  list2[2*list2num+1] = Candidate[CanPos(ind,1)];
	  list2num++;	  
	}
      else if (Phenotype[ind] ==1)
	{
	  list1[2*list1num] = Candidate[CanPos(ind,0)];
	  list1[2*list1num+1] = Candidate[CanPos(ind,1)];
	  list1num++;
	}
      else
	printf("Chisq not accurate if more than 2 phenotypes\n");	  
    }
 
  chisq = ChiSq(list1,2*list1num,list2,2*list2num,POOLFREQ,MISSING,df);

  free(list1);
  free(list2);
  return (chisq);
}
/*-----------------------------------------*/
void InitPoolTrans(int *Translation,int *PoolTrans,int numalleles,int loc)
     /*Start every pooled category as a singleton: row a of PoolTrans
     holds the original allele value for code a at this locus, followed
     by MISSING, which marks the end of the row.*/
{
  int code = 0;

  while (code < numalleles)
    {
      int *row = &PoolTrans[PoolTransPos(code,0)];

      row[0] = Translation[TransPos(loc,code)];
      row[1] = MISSING;
      code++;
    }
}
/*-----------------------------------------*/
void ReTranslate(int *PoolTrans,int to,int from,int numalleles)
     /*Record the merge of pooled class from into pooled class to: the
     allele list of from is appended to the list of to (both lists are
     terminated by MISSING), and then every row after from moves up by
     one so that the class codes stay contiguous.*/
{
  int dst = 0;          /*write position in row to*/
  int src;              /*read position in row from*/
  int room;
  int row,col;

  /*find the terminator of row to*/
  while ((dst <= MAXALLELES) && (PoolTrans[PoolTransPos(to,dst)] != MISSING))
    dst++;
  room = MAXALLELES - dst;

  /*append row from, terminator included, without running past the row end*/
  for (src=0; src<=room; src++)
    {
      const int value = PoolTrans[PoolTransPos(from,src)];

      PoolTrans[PoolTransPos(to,dst)] = value;
      if (value == MISSING)
	break;
      dst++;
    }

  /*close the gap left by row from*/
  for (row=from; row<numalleles-1; row++)
    for (col=0; col<=MAXALLELES; col++)
      PoolTrans[PoolTransPos(row,col)] = PoolTrans[PoolTransPos(row+1,col)];
}
/*-----------------------------------------*/
int DropRareAlleles(int *Candidate,int numalleles,int *Translation,
		    int *PoolTrans,int loc, int numindsphen)
/*Pool rare alleles at the candidate locus.

  While the rarest allele class has at most POOLFREQ copies among the
  numindsphen individuals, it is merged with the second-rarest class.
  Candidate is recoded in place so that the class codes stay contiguous
  and PoolTrans records which original alleles make up each class.

  Returns the new number of allele classes.  With POOLFREQ<=0 nothing is
  pooled and numalleles is returned unchanged.  Returns 0 (after a
  warning) if the work array cannot be allocated.

  The work array of allele counts is now released before returning;
  previously it was leaked on every call.

  NOTE(review): if only one class is left and it has at most POOLFREQ
  copies, it is merged with itself and 0 is returned -- confirm that
  callers expect this.*/
{
  int *count;
  int ind,line;
  int allele,i;
  int min1,min2,min3,all1,all2;


  InitPoolTrans(Translation,PoolTrans,numalleles,loc);
  if (POOLFREQ<=0) return numalleles;

  count= calloc(numalleles,sizeof(int));
  if (count==NULL)
    {printf("Warning error assigning memory in DropRareAlleles\n"); return 0;}
  
  for (ind=0; ind<numindsphen; ind++)     /*compute allele counts*/
    for (line=0; line<2; line++)
      {
	allele = Candidate[CanPos(ind,line)];
	if (allele != MISSING)
	  count[allele]++;
      }

  do        /*combine undersized classes*/
    {                                          /*figure out 3 lowest counts*/
      min1 = min2 = min3 = 2*NUMINDS+1;        /*initialize with very high counts
						 then look for smaller counts*/
      all1 = all2 = 0;
      for (allele=0; allele<numalleles; allele++)  
	{
	  if (count[allele] < min1) 
	    {
	      min3 = min2;
	      min2 = min1; all2 = all1;
	      min1 = count[allele]; all1 = allele;
	    }
	  else if (count[allele] < min2) 
	    {
	      min3 = min2;
	      min2 = count[allele]; all2 = allele;
	    }
	  else if (count[allele] < min3) 
	    min3 = count[allele];
	}
      if (min1<=POOLFREQ)   /*combine classes 1 and 2*/
	{
	  /*always merge the higher code into the lower one*/
	  if (all1 > all2) {allele = all2; all2 = all1; all1 = allele;}
	  count[all1] += count[all2];
	  for (i=all2; i<numalleles-1; i++)
	    count[i] = count[i+1];
	  ReTranslate(PoolTrans,all1,all2,numalleles); /*keep track of what's happening*/

	  for (ind=0; ind<numindsphen; ind++)     /*merge alleles, and slide others down*/
	    for (line=0; line<2; line++)
	      {
		allele = Candidate[CanPos(ind,line)];
		if (allele==all2) Candidate[CanPos(ind,line)] = all1;
		else if (allele>all2) Candidate[CanPos(ind,line)] = allele - 1;	  
	      }
	  numalleles--;
	   
	}

      /*min1..min3 are the counts from BEFORE the merge.  Go round again
	if the merged class is still undersized (min1+min2) or if a third
	class is undersized as well (min3).  The parentheses spell out
	how the original condition was already parsed (&& binds tighter
	than ||).*/
    } while ((min1+min2 <= POOLFREQ) || ((min3 <= POOLFREQ) && (numalleles>=1)));

  free(count);
  return numalleles;
}

/*-----------------------------------------------*/
void SimulateCandidateInd(int *SimCandidate, int *Candidate,
			  double *Q, double *CandP, int numalleles,int numindsphen)
     /*Simulate data at the candidate locus under the null model.
       Each allele is sampled independently conditional on P,Q: allele j
       of individual i has weight sum over pops of Q[i][pop]*CandP[pop][j].

       SimCandidate  output, same layout as Candidate (CanPos)
       Candidate     observed data; only used to copy the pattern of
                     MISSING entries
       numalleles    number of allele classes (assumed >=1 -- the code
                     reads CumProbs[numalleles-1])
       numindsphen   number of individuals to simulate

       Running out of memory is fatal (Kill); previously it was only
       reported and the NULL buffer was then written to.*/
{
  int ind,line,pop,allele;
  double *CumProbs; /*[MAXALLELES] cumulative allele weights for one individual*/
  double sum;
  double ranreal;

  CumProbs = calloc(MAXALLELES,sizeof(double));
  if (CumProbs==NULL) 
    {
      printf("WARNING: couldn't assign memory in SimulateCandidateInd\n");
      Kill();
    }

  for (ind=0; ind<numindsphen;ind++)
    {
      for (allele=0; allele<numalleles; allele++)  /*compute probs of each allele*/
	{
	  sum = 0;
	  for (pop=0; pop<MAXPOPS; pop++) 
	    sum += Q[QPos(ind,pop)] * CandP[CanPPos(pop,allele)];
	  if (allele==0) CumProbs[allele] = sum;
	  else CumProbs[allele] = CumProbs[allele-1] + sum;
	}
      
      for (line=0; line<2;line++)
	{
	  if (Candidate[CanPos(ind,line)]==MISSING)
	    SimCandidate[CanPos(ind,line)] = MISSING;
	  else
	    {
	      /*draw on [0,total weight) so the weights need not sum to 1*/
	      ranreal = RandomReal(0, CumProbs[numalleles-1]);
	      for (allele=0; allele<numalleles; allele++)  /*simulate alleles*/
		if ( ranreal < CumProbs[allele])
		  {
		    SimCandidate[CanPos(ind,line)] = allele;
		    break;
		  }
	      /*damage control: a draw equal to the upper end matches no
		interval above, so give it the last allele*/
	      if (ranreal >= CumProbs[numalleles-1])
		SimCandidate[CanPos(ind,line)] = numalleles-1;
	    }
	}
    }

  free(CumProbs);
}
/*-----------------------------------------*/
void EM(int *Candidate,
	double *Q,
	int *Phenotype,
	int numalleles,
	double *CandP,
	double *PhAlFreqs,
	int numindsphen)

     /*This function estimates all the freqs that are needed for
       computing the likelihood ratio test. [Mainly coded by Matthew Stephens]

       Inputs
         Candidate    allele copies at the candidate locus, CanPos(ind,line)
         Q            ancestry coefficients, QPos(ind,pop)
         Phenotype    recoded phenotype of each individual
         numalleles   number of allele classes at this locus
         numindsphen  number of individuals to use
       Outputs
         CandP        allele frequencies per population, CanPPos(pop,allele)
         PhAlFreqs    allele frequencies per population and phenotype,
                      PhAlFreqsPos(pop,phen,allele)

       Both sets of frequencies are estimated together, starting from
       1/numalleles everywhere.  The global scratch arrays Pr, Sum, PrPh,
       SumPh, N, Nsum, NPh, NsumPh, OldAlFreqs and OldPhAlFreqs are used,
       so AllocateEMMemory must have been called first.  Iteration stops
       once no frequency has changed by more than EMERROR since the
       previous round. */
{
  int a,i,j,k,phen;
  int count;          /*iteration counter; only of use when debugging*/
  double maxdiff1,maxdiff2;   /*largest change in CandP / PhAlFreqs this round*/


  /* Starting point: every allele equally frequent in every population
     and for every phenotype */
  for(j=0;j<numalleles;j++)
    for(k=0;k<MAXPOPS;k++)
      {
	CandP[CanPPos(k,j)]=1.0/numalleles;
	OldAlFreqs[OldAlFreqsPos(k,j)]=1.0/numalleles;
	for(phen=0;phen<NUMPHENS;phen++)
	  {
	    PhAlFreqs[PhAlFreqsPos(k,phen,j)]=1.0/numalleles;
	    OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)]=1.0/numalleles;
	  }
      }

  count = 0;
  do
    {
      count++;
      /* The E-step */
  
      /* initialise Pr and Sum to 0*/
      for(i=0;i<numindsphen;i++)
	for(a=0;a<2;a++)
	  {
	    Sum[SumPos(i,a)]=0.0;
	    SumPh[SumPhPos(i,a)]=0.0;
	    for(k=0;k<MAXPOPS;k++)   /*could delete this loop--JP*/
	      {
		Pr[PrPos(i,a,k)]=0.0;
		PrPh[PrPhPos(i,a,k)]=0.0;
	      }
	  }

      /* For every observed allele copy, the unnormalised weight of
	 population k is Q[i][k] times the current frequency of that
	 allele in k: once with the pooled frequencies (Pr) and once with
	 the frequencies for the individual's own phenotype (PrPh) */
      for(i=0;i<numindsphen;i++)
	for(a=0;a<2;a++)
	  if(Candidate[CanPos(i,a)]!=MISSING)
	    for(k=0;k<MAXPOPS;k++)
  	      {
		Sum[SumPos(i,a)]+=Q[QPos(i,k)]*CandP[CanPPos(k,Candidate[CanPos(i,a)])];
		Pr[PrPos(i,a,k)]=Q[QPos(i,k)]*CandP[CanPPos(k,Candidate[CanPos(i,a)])];
		SumPh[SumPhPos(i,a)]+=Q[QPos(i,k)]*PhAlFreqs[PhAlFreqsPos(k,Phenotype[i],Candidate[CanPos(i,a)])];
		PrPh[PrPhPos(i,a,k)]=Q[QPos(i,k)]*PhAlFreqs[PhAlFreqsPos(k,Phenotype[i],Candidate[CanPos(i,a)])];
	      }

      /* normalise probabilities*/
      /* NOTE(review): this also divides 0 by 0 for MISSING copies (their
	 Sum is still 0); those entries are never read below, because the
	 accumulation loop skips MISSING copies */
      for(i=0;i<numindsphen;i++)
	for(a=0;a<2;a++)
	  for(k=0;k<MAXPOPS;k++)
	    {
	      Pr[PrPos(i,a,k)]/=Sum[SumPos(i,a)];
	      PrPh[PrPhPos(i,a,k)]/=SumPh[SumPhPos(i,a)];
	    }
      /* Calculate expected number of each allele in each population*/
      /* (in total and within each phenotype) */
      for(k=0;k<MAXPOPS;k++)
	{
	  Nsum[k]=0;
	  for(phen=0;phen<NUMPHENS;phen++)
	    NsumPh[NsumPhPos(k,phen)]=0;
	  for(j=0;j<numalleles;j++)
	    {
	      N[NPos(k,j)]=0;
	      for(phen=0;phen<NUMPHENS;phen++)
		NPh[NPhPos(k,phen,j)]=0;	      
	    }
	}

      for(i=0;i<numindsphen;i++)
	for(a=0;a<2;a++)
	  if(Candidate[CanPos(i,a)]!=MISSING)
	    {
	      for(k=0;k<MAXPOPS;k++)
		{
		  N[NPos(k,Candidate[CanPos(i,a)])]+=Pr[PrPos(i,a,k)];
		  Nsum[k]+=Pr[PrPos(i,a,k)];
		  NPh[NPhPos(k,Phenotype[i],Candidate[CanPos(i,a)])]
		    +=PrPh[PrPhPos(i,a,k)];
		  NsumPh[NsumPhPos(k,Phenotype[i])]+=PrPh[PrPhPos(i,a,k)];
		}
	    }

      /* The M-step */
      /* new frequency = expected count / expected total; a population
	 (or population-phenotype pair) with no expected copies gets 0 */
     
      for(k=0;k<MAXPOPS;k++)
	for(j=0;j<numalleles;j++)
	  {
	    if(Nsum[k]>0)
	      CandP[CanPPos(k,j)]=N[NPos(k,j)]/Nsum[k];
	    else
	      CandP[CanPPos(k,j)]=0;
	    for(phen=0;phen<NUMPHENS;phen++)
	      {
		if(NsumPh[NsumPhPos(k,phen)]>0)
		  PhAlFreqs[PhAlFreqsPos(k,phen,j)]=NPh[NPhPos(k,phen,j)]/NsumPh[NsumPhPos(k,phen)];
		else
		  PhAlFreqs[PhAlFreqsPos(k,phen,j)]=0;
		
	      } 
	  }

      /* check for convergence */
      maxdiff1 = 0.0;
      maxdiff2 = 0.0;
      
      /* maxdiff2: largest absolute change in PhAlFreqs.  The scan stops
	 as soon as one change exceeds EMERROR, since another round is
	 then needed anyway */
      for(j=0;j<numalleles;j++){
	for(k=0;k<MAXPOPS;k++)
	  {
	    for(phen=0;phen<NUMPHENS;phen++)
	      {
		if ((PhAlFreqs[PhAlFreqsPos(k,phen,j)] - OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)]) > maxdiff2)
		  maxdiff2 = PhAlFreqs[PhAlFreqsPos(k,phen,j)] - OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)];
		else if ((OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)] - PhAlFreqs[PhAlFreqsPos(k,phen,j)]) > 
			 maxdiff2)
		  maxdiff2 = OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)] - PhAlFreqs[PhAlFreqsPos(k,phen,j)];
		if (maxdiff2 > EMERROR) break; 	      
	      }
	    if (maxdiff2 > EMERROR) break;
	  }
	if (maxdiff2 > EMERROR) break;
      }
      
      /* maxdiff1: the same for CandP, but only looked at once PhAlFreqs
	 has settled; otherwise it stays at 0 for this round */
      if (maxdiff2 < EMERROR)
	for(j=0;j<numalleles;j++)
	  {
	    for(k=0;k<MAXPOPS;k++)
	      {
		if ((CandP[CanPPos(k,j)] - OldAlFreqs[OldAlFreqsPos(k,j)]) > maxdiff1)
		  maxdiff1 = CandP[CanPPos(k,j)] - OldAlFreqs[OldAlFreqsPos(k,j)];
		else if ((OldAlFreqs[OldAlFreqsPos(k,j)] - CandP[CanPPos(k,j)]) > maxdiff1)
		  maxdiff1 = OldAlFreqs[OldAlFreqsPos(k,j)] - CandP[CanPPos(k,j)];
		if (maxdiff1 > EMERROR) break; 
	      }
	    if (maxdiff1 > EMERROR) break; 
	  }
      
      /* remember this round's estimates for the next comparison */
      for(j=0;j<numalleles;j++)
	for(k=0;k<MAXPOPS;k++)
	  {
	    OldAlFreqs[OldAlFreqsPos(k,j)]=CandP[CanPPos(k,j)];
	    for(phen=0;phen<NUMPHENS;phen++)
	      OldPhAlFreqs[OldPhAlFreqsPos(k,phen,j)]=PhAlFreqs[PhAlFreqsPos(k,phen,j)];
	  }

    } while ((maxdiff1 > EMERROR)||(maxdiff2 > EMERROR));
    
}
/*-----------------------------------------*/
void PrintFreqs(FILE *file,double *CandP,double *PhAlFreqs,
		int numalleles,int loc,int *PoolTrans, int *TransPheno,char *Markername)
     /*Write the estimated allele frequencies for one locus to file.
       For each allele class the original allele value(s) are listed
       (from PoolTrans), and then for each population the pooled
       frequency, the original phenotype codes (from TransPheno) and the
       phenotype-specific frequencies.  loc is printed as given; the
       marker name is looked up with loc-1.*/
{
  int allele,pop,phen,pos;
  int members;          /*number of original alleles in the current class*/

  fprintf(file,"Locus %d: ",loc);
  if (MARKERNAMES)
    PrintGeneName(file,loc-1,Markername);
  fputs("  Estimated allele frequencies\n",file);

  for (allele=0; allele < numalleles; allele++)
    {
      /*length of this class's list: up to MISSING, at most MAXALLELES*/
      members = 0;
      while ((members < MAXALLELES)
	     && (PoolTrans[PoolTransPos(allele,members)] != MISSING))
	members++;

      fputs("Allele",file);
      if (members>1) fputs("s",file);
      for (pos=0; pos<members; pos++)
	fprintf(file," %d",PoolTrans[PoolTransPos(allele,pos)]);
      fputs("\n",file);

      for (pop=0; pop<MAXPOPS; pop++)
	{
	  fprintf(file,"%1.3f (",CandP[CanPPos(pop,allele)]);

	  /*header row: the original phenotype codes*/
	  for (phen=0; phen<NUMPHENS; phen++)
	    fprintf(file," %3d  ",TransPheno[phen]);
	  fputs("\n",file);

	  /*frequency row, closed by a bracket after the last entry*/
	  for (phen=0; phen<NUMPHENS; phen++)
	    {
	      fprintf(file,"%1.3f",PhAlFreqs[PhAlFreqsPos(pop,phen,allele)]);
	      fputs((phen < NUMPHENS-1) ? " " : ")     ",file);
	    }
	  fputs("\n",file);
	}
      fputs("\n",file);
    }
  fputs("\n",file);
}
/*-----------------------------------------*/
double LPDataWAssoc(int *Candidate,
		    double *Q,
		    int *Phenotype,
		    double *PhAlFreqs,
		    int numindsphen)
{
  /*Log probability of the alleles at the candidate locus under the
    alternative model, i.e. using the phenotype-specific frequencies.
    Every non-MISSING allele copy contributes the factor
    sum over pops of Q[ind][pop]*PhAlFreqs[pop][phen][allele].

    The factors are multiplied into a running product, which is folded
    into the sum of logs whenever it drops below UNDERFLOW; this saves
    taking one log per copy.*/
  double logtotal = 0.0;
  double product = 1.0;
  int ind,line,pop;

  for (ind=0; ind<numindsphen; ind++)
    {
      const int phen = Phenotype[ind];

      for (line=0; line<2; line++)
	{
	  const int allele = Candidate[CanPos(ind,line)];
	  double prob = 0.0;

	  if (allele == MISSING)
	    continue;

	  for (pop=0; pop<MAXPOPS; pop++)
	    prob += Q[QPos(ind,pop)]*PhAlFreqs[PhAlFreqsPos(pop,phen,allele)];

	  /*the product deliberately carries over from one individual to
	    the next (bug fix of 3 May 03, pointed out by Nina Wawro)*/
	  product *= prob;
	  if (product<UNDERFLOW)
	    {
	      logtotal += log(product);
	      product = 1.0;
	    }
	}
    }

  logtotal += log(product);
  return logtotal;
}
/*-----------------------------------------*/
double LPDataNoAssoc(int *Candidate,
		     double *Q,
		     int *Phenotype,
		     double *CandP,
		     int numindsphen)
{
  /*Log probability of the alleles at the candidate locus under the null
    model, i.e. ignoring the phenotypes (Phenotype is not read).  Every
    non-MISSING allele copy contributes the factor
    sum over pops of Q[ind][pop]*CandP[pop][allele].

    The factors are multiplied into a running product, which is folded
    into the sum of logs whenever it drops below UNDERFLOW.*/
  double logtotal = 0.0;
  double product = 1.0;
  int ind,line,pop;

  for (ind=0; ind<numindsphen; ind++)
    {
      for (line=0; line<2; line++)
	{
	  const int allele = Candidate[CanPos(ind,line)];
	  double prob = 0.0;

	  if (allele == MISSING)
	    continue;

	  for (pop=0; pop<MAXPOPS; pop++)
	    prob += Q[QPos(ind,pop)]*CandP[CanPPos(pop,allele)];

	  product *= prob;
	  if (product<UNDERFLOW)
	    {
	      logtotal += log(product);
	      product = 1.0;
	    }
	}
    }

  logtotal += log(product);
  return logtotal;
}

/*-----------------------------------------------*/
double ComputeStat(int *Candidate,
		   double *Q,
		   int *Phenotype,
		   int numalleles,
		   double *CandP,
		   double *PhAlFreqs,
		   int numindsphen)
     /*Test statistic for one locus: fit both sets of frequencies with EM
       (CandP and PhAlFreqs are overwritten) and return the log
       probability of the data with association minus the log
       probability without.*/
{
  double withassoc;
  double noassoc;

  EM(Candidate,Q,Phenotype,numalleles,CandP,PhAlFreqs, numindsphen);   

  withassoc = LPDataWAssoc(Candidate,Q,Phenotype,PhAlFreqs, numindsphen);
  noassoc = LPDataNoAssoc(Candidate,Q,Phenotype,CandP, numindsphen);

  return withassoc - noassoc;
}
/*-----------------------------------------------*/
void AllocateEMMemory()
     /*Allocate the zero-filled global scratch arrays used by EM.  If any
       allocation fails, an error is printed, FreeEMMemory is called and
       the program is halted.*/
{
  int allok;

  /*per allele copy, per population*/
  Pr = calloc(NUMINDS*2*MAXPOPS,sizeof(double));
  Sum = calloc(NUMINDS*2,sizeof(double));
  PrPh = calloc(NUMINDS*2*MAXPOPS,sizeof(double));
  SumPh = calloc(NUMINDS*2,sizeof(double));

  /*expected allele counts*/
  N = calloc(MAXPOPS*MAXALLELES,sizeof(double));
  Nsum = calloc(MAXPOPS,sizeof(double));
  NPh = calloc(MAXPOPS*NUMPHENS*MAXALLELES,sizeof(double));
  NsumPh = calloc(MAXPOPS*NUMPHENS,sizeof(double));

  /*previous-iteration frequencies*/
  OldAlFreqs = calloc(MAXPOPS*MAXALLELES,sizeof(double));
  OldPhAlFreqs = calloc(MAXPOPS*NUMPHENS*MAXALLELES,sizeof(double));

  allok = (Pr!=NULL) && (Sum!=NULL) && (PrPh!=NULL) && (SumPh!=NULL)
    && (N!=NULL) && (Nsum!=NULL) && (NPh!=NULL) && (NsumPh!=NULL)
    && (OldAlFreqs!=NULL) && (OldPhAlFreqs!=NULL);
  if (allok)
    return;

  printf("Error in assigning memory for EM (not enough space?)\n"); 
  FreeEMMemory();  
  Kill();
}
/*-----------------------------------------------*/
/*
 * FreeEMMemory: release the global EM work arrays.
 *
 * For every array that is currently NULL the message "EM free error <k>" is
 * printed instead (k = 1..10, in the order below).  Each pointer is reset to
 * NULL once freed, so a repeated call cannot free the same block twice; it
 * only reports the "free error" lines again.
 */
void FreeEMMemory()
{
    if (Pr!=NULL) {free(Pr); Pr = NULL;}
    else printf("EM free error 1\n");

    if (Sum!=NULL) {free(Sum); Sum = NULL;}
    else printf("EM free error 2\n");

    if (PrPh!=NULL) {free(PrPh); PrPh = NULL;}
    else printf("EM free error 3\n");

    if (SumPh!=NULL) {free(SumPh); SumPh = NULL;}
    else printf("EM free error 4\n");

    if (N!=NULL) {free(N); N = NULL;}
    else printf("EM free error 5\n");

    if (Nsum!=NULL) {free(Nsum); Nsum = NULL;}
    else printf("EM free error 6\n");

    if (NPh!=NULL) {free(NPh); NPh = NULL;}
    else printf("EM free error 7\n");

    if (NsumPh!=NULL) {free(NsumPh); NsumPh = NULL;}
    else printf("EM free error 8\n");

    if (OldAlFreqs!=NULL) {free(OldAlFreqs); OldAlFreqs = NULL;}
    else printf("EM free error 9\n");

    if (OldPhAlFreqs!=NULL) {free(OldPhAlFreqs); OldPhAlFreqs = NULL;}
    else printf("EM free error 10\n");
}
/*-----------------------------------------------*/
/*
 * PrintTable: write a summary of the p-value histogram to `file`.
 *
 * file    open output stream
 * counts  histogram with NUMBOXES slots; counts[i] is the number of
 *         p-values that were placed in slot i
 * loc     number of tests the counts are based on (the divisor used to turn
 *         counts into fractions)
 *
 * Output is one row per group of 5 slots: the p-value range of the group,
 * the fraction in each of its slots, the fraction in the whole group, the
 * cumulative fraction so far, and the cumulative fraction minus the upper
 * end of the range, (i+1)/NUMBOXES.
 *
 * When loc is not positive (no valid tests yet) every fraction is reported
 * as 0 rather than dividing by zero and printing NaN.
 */
void PrintTable(FILE *file, int counts[NUMBOXES],int loc)
{
  int i;
  double frac;        /*fraction of tests falling in the current slot*/
  double sum = 0.0;   /*fraction in the current group of 5 slots*/
  double cum = 0.0;   /*cumulative fraction over all groups so far*/

  fprintf(file,"\n\nSummary of distribution of p-values\n\n");

  fprintf(file,"\n");
  for (i=0;i<NUMBOXES;i++)
    {
      if (i%5==0)
        {
          /*start of a new row: reset the group total, print the range*/
          sum = 0.0;
          fprintf(file,"%1.2f---%1.2f:    ",
                  (double) (i)/NUMBOXES,(double) (i+5)/NUMBOXES);
        }

      frac = (loc > 0) ? ((double) counts[i]/loc) : 0.0;
      fprintf(file,"%1.3f ",frac);
      sum += frac;

      if ((i+1)%5==0)
        {
          /*end of the row: group total, running total, and the running
            total's offset from the upper end of the range*/
          cum += sum;
          fprintf(file,"    %1.4f",sum);
          fprintf(file,"  %1.4f",cum);
          fprintf(file,"  %1.4f\n",cum-((double)(i+1)/NUMBOXES));
        }
    }
  fprintf(file,"\n");
}
/*----------------------------------------------------*/
/*
 * PrintGeneName: write the marker name of locus `loc` to `file`.
 *
 * file        open output stream
 * loc         locus index, used with MarkernamePos() to find the name
 * Markername  buffer holding the marker names
 *
 * At most STRATGENELEN characters are written.  If a '\0' is met before
 * that, a single space is written after the name and the function stops;
 * if the '\0' is the very first character (no name stored), "XXX" is
 * written before the space.  A name that fills all STRATGENELEN positions
 * is written without the trailing space.
 */
void PrintGeneName(FILE * file, int loc, char *Markername)
{
  int i;
  char ch;

  for (i=0; i<STRATGENELEN; i++)
    {
      ch = Markername[MarkernamePos(loc,i)];
      if (ch == '\0')
        {
          if (i==0) fprintf(file,"XXX");
          fprintf(file," ");
          break;
        }
      /*the format used to be "%c%"; the dangling '%' is an incomplete
        conversion specification, which is undefined behavior*/
      fprintf(file,"%c",ch);
    }
}
/*-----------------------------------------------*/
/*
 * Printing: write the one-line result for a locus to `file` and, when asked,
 * record its p-value in the histogram.
 *
 * file        open output stream
 * loc         0-based locus index (printed as loc+1)
 * chisq       chi-square statistic to print
 * numalls     number of alleles left after pooling; numalls-1 is printed
 *             as the degrees of freedom
 * teststat    test statistic to print
 * pvalue      p-value to print; also selects the histogram slot
 * counts      histogram of p-values with NUMBOXES slots
 * upcounts    indicator saying whether to add pvalue to counts (this should
 *             happen only once per locus even if the line is printed to
 *             several streams)
 * Markername  marker-name buffer, used only when MARKERNAMES is set
 *
 * With numalls <= 1 only a "no test" line is printed and counts is left
 * untouched.
 */
void Printing(FILE *file, int loc,double chisq,int numalls,
	      double teststat,double pvalue,int counts[NUMBOXES],
	      int upcounts, char *Markername)
{
  int box;   /*histogram slot that pvalue falls into*/

  if (numalls <= 1)
    {
      fprintf(file, "%3d: no test--only 1 allele remained after pooling\n",loc+1);
      return;
    }

  fprintf(file, "%3d: chisq= %2.3lf %2d df; TS = %1.2f, p = %1.5e",
          loc+1, chisq, numalls-1, teststat, pvalue);

  if (MARKERNAMES)
    {
      fputs("  ", file);
      PrintGeneName(file, loc, Markername);
    }

  /*one star below 0.05, two below 0.01, three below 0.001*/
  if (pvalue < 0.05)
    {
      fputs("     *", file);
      if (pvalue < 0.01)
        {
          fputs("*", file);
          if (pvalue < 0.001) fputs("*", file);
        }
    }
  fputs("\n", file);

  if (upcounts)
    {
      /*pvalue==1.0 would index one past the end, so it goes in the last slot*/
      box = (pvalue==1.0) ? (NUMBOXES-1) : ((int) (NUMBOXES*pvalue));
      counts[box]++;
    }
}
/*===========================================================*/

/*MAIN
--------------------------------------------------
Plan:

--reserve memory for arrays
--ReadInputFile
--Read the Q's
--Clean output file
Run STRAT stuff 

--Deal with missing pheno data
Print data summary

*/

/* Program entry point.  Reads the parameters and input data, then for each
   candidate locus in [start,stop) computes the test statistic, estimates a
   p-value from NUMSIMSTATS simulated candidates, and writes the results to
   stdout and to the files <STRATFILE>_P and <STRATFILE>_fr.  The output
   prefix defaults to OUTFILE and can be overridden with "-S <name>" on the
   command line.  Returns 0 on completion; fatal errors are routed through
   Kill(), which is assumed not to return. */
int main(int argc, char *argv[])
{
/*input data---------*/
  int *Geno;              /*NUMINDSx2xNUMLOCI: genotypes*/
  struct IND *Individual; /*NUMINDS: records for each individual*/
  int *Translation;      /*NUMLOCIxMAXALLELES: value of each coded allele*/
  int *NumAlleles;       /*NUMLOCI: number of alleles at each locus*/
  double *Q;             /*NUMINDSxMAXPOPS:  Q=ancestry of individuals*/
  int *Phenotype;        /*NUMINDS: copy of the phenotype data in Individual*/

  double *modQ;             /*NUMINDSxMAXPOPS:  Q=ancestry of individuals;
                              individuals with phenotypes only*/
  int *modPhenotype;        /*NUMINDS: copy of the phenotype data in Individual;
                              individuals with phenotypes only*/

  int *TransPheno;       /*NUMINDS: phenotypes recoded to (0,NUMPHENS-1); store originals*/
  int *PoolTrans;        /*MAXALLELESxMAXALLELES: translation of pooled alleles*/
/*manipulating candidate data*/
  int *Candidate;        /*NUMINDSx2: observed genotypes at candidate locus*/
  int *SimCandidate;     /*NUMINDSx2: genotypes at simulated candidate locus C'*/
/*allele frequencies*/
  double *CandP;         /*MAXPOPS*MAXALLELES: allele freqs in each pop*/
  double *SimCandP;      /*MAXPOPS*MAXALLELES: allele freqs in each pop at C'*/
  double *PhAlFreqs = NULL; /*MAXPOPS*NUMPHENS*MAXALLS: current phenotype allele
                           frequencies (either true candidate, or simulated).
                           Starts as NULL because it is allocated late but is
                           handed to FreeAllSTRAT on the early failure path*/
/*storing data*/
  int counts[NUMBOXES];  /*histogram of p-values*/
  double chisq;          /*chisq test stat for locus*/
  double sumchisq = 0.0; /*sum of chisq stats*/
  int df;                /*degrees of freedom for locus*/
  int sumdf=0;           /*sum of df*/
  double teststat,simteststat; /*value of teststat at C and C'*/
  int numbigger;         /*number of simteststats bigger than observed*/
  int numalls;           /*number of alleles after dropping rare alleles*/
  int loc,reps;          /*loop variables*/
  int start,stop;
  double pvalue;
  int numindsphen;      /*number of individuals with phenotype data*/
  int testssofar=0;      /*number of loci with valid tests*/

  //Added for structure format compatibility; not used in this program
  //except for Markername (printing results)
  char *Markername;             /*STRATGENELEN*NUMLOCI */
  double *Mapdistance;          /*NUMLOCI */
  double *Phase;                /*NUMLOCI*NUMINDS */

  /*output files*/
  FILE *PvalsFile,*FreqsFile;
  char outname1[STRLEN+20],outname2[STRLEN+20];
  char STRATFILE[STRLEN + 1];
  int i;

  /*=====Code for getting started=============================*/

  WelcomeSTRAT(stdout);   /*welcome*/

  // changed by William, not consistent with old version
  GetParams(1,argc,argv); /*read in parameter values*/

  /* default is that STRAT output file has same prefix name
     as structure output file _q; but this can be overridden
     by command line argument below.  The next bit of code would
     fit better in params.c but Daniel has codemastership at this time.
     Both copies are bounded by the size of STRATFILE, so an over-long
     name is truncated instead of overrunning the buffer. */
  snprintf(STRATFILE,sizeof(STRATFILE),"%s",OUTFILE);
  for (i=0; i<argc-1; i++)
    if (strcmp(argv[i],"-S")==0)
      snprintf(STRATFILE,sizeof(STRATFILE),"%s",argv[i+1]);


  Geno = calloc(2*NUMLOCI*NUMINDS,sizeof(int));
  Individual = calloc(NUMINDS,sizeof(struct IND));
  if ((Geno==NULL)||(Individual==NULL))
    {printf("Error in assigning memory (not enough space?)\n"); Kill();}

  //these arrays for compatibility with Structure, not really used here
  Mapdistance = calloc (NUMLOCI, sizeof (double));
  Phase = calloc (NUMLOCI * NUMINDS, sizeof (double));
  Markername = calloc (STRATGENELEN*NUMLOCI, sizeof (char));
  if ((Mapdistance==NULL)||(Phase==NULL)||(Markername==NULL))
    {printf("Error in assigning memory (not enough space?)\n"); Kill();}


  if (RECESSIVEALLELES)
    {printf("STRAT cannot be run with RECESSIVEALLELES turned on\n"); Kill();}

  ReadInputFile(Geno,Mapdistance,Markername,Individual,Phase, NULL);//last argument is "Recessive"
  /*read in data file*/
  MAXALLELES = FindMaxAlleles(Geno, NULL);  //last argument is "Recessive"

  /*=============set aside memory space=====================*/

  /*Note NUMPHENS is no longer pre-defined--not set until TranslatePhenotypes*/

  Translation = calloc(NUMLOCI*MAXALLELES,sizeof(int));
  NumAlleles = calloc(NUMLOCI,sizeof(int));
  Q = calloc(NUMINDS*MAXPOPS,sizeof(double));
  Phenotype = calloc(NUMINDS,sizeof(int));
  modQ = calloc(NUMINDS*MAXPOPS,sizeof(double));
  modPhenotype = calloc(NUMINDS,sizeof(int));
  TransPheno = calloc(NUMINDS,sizeof(int));
  Candidate = calloc(NUMINDS*2,sizeof(int));
  SimCandidate = calloc(NUMINDS*2,sizeof(int));
  CandP = calloc(MAXPOPS*MAXALLELES,sizeof(double));
  SimCandP = calloc(MAXPOPS*MAXALLELES,sizeof(double));
  PoolTrans = calloc(MAXALLELES*(MAXALLELES+1),sizeof(int));

  if ((Translation==NULL)||(NumAlleles==NULL)||(Q==NULL)||(modQ==NULL)||
      (Candidate==NULL)||
      (Phenotype==NULL)||(modPhenotype==NULL)||(TransPheno==NULL)||
      (SimCandidate==NULL)||(CandP==NULL)||
      (SimCandP==NULL)||(PoolTrans==NULL))
    {
      printf("Error in assigning memory (not enough space?)\n");
      /*PhAlFreqs is still NULL at this point*/
      FreeAllSTRAT(Geno,Individual,Translation,NumAlleles,Q,Phenotype,
                   Candidate,SimCandidate,CandP,SimCandP,PhAlFreqs,PoolTrans);
      Kill();
    }

  snprintf(outname1,sizeof(outname1),"%s_P",STRATFILE);
  PvalsFile = fopen(outname1,"w");
  snprintf(outname2,sizeof(outname2),"%s_fr",STRATFILE);
  FreqsFile = fopen(outname2,"w");
  if ((PvalsFile==NULL) || (FreqsFile==NULL))
    {
      /*everything below writes to both streams unconditionally, so a
        failed open cannot be treated as a mere warning*/
      printf("Error: unable to open output files %s and/or %s\n",outname1,outname2);
      if (PvalsFile!=NULL) fclose(PvalsFile);
      if (FreqsFile!=NULL) fclose(FreqsFile);
      Kill();
    }

  /*=========done setting aside memory space=====================*/

  /*Notes: Phenotype contains the full list of phenotypes, but Candidate contains
    genotype data only for those individuals with valid (ie not missing) phenotypes.
    The same is true of Q*/

  if (RANDOMIZE) Randomize(-1);    /*random number seed */
  InitCounts(counts);            /*histogram of p-values*/
  ReadQ(Q,Individual);           /*input Q-hat from structure output file*/
  CountAlleles(Geno,NumAlleles,Translation);  /*recode alleles to {0,..,1-k}*/
  CopyPhenotype(Phenotype,Individual);  /*copy phenotype data to array Phenotype
                                         to simplify notation*/

  TranslatePhenotypes(Phenotype,TransPheno);   /* NUMPHENS set in this function*/
  /*CheckPhens(Phenotype); [I used to require all phenotypes in 0..NUMPHENS-1*/

  RemoveMissings(Q,modQ, Phenotype, modPhenotype); /*create modified Q and Phenotype
                                                     that removes individuals with
                                                     missing data*/

  /*had to move this line down because NUMPHENS not defined until now*/
  PhAlFreqs = calloc(MAXPOPS*NUMPHENS*MAXALLELES,sizeof(double));
  if (PhAlFreqs==NULL){printf("Error in assigning memory (not enough space?)\n");Kill(); }
  AllocateEMMemory();


  if (LOCUSxONLY)
    {
      start=LOCUSxONLY-1;
      stop=LOCUSxONLY;
      if ((LOCUSxONLY < 0) || (LOCUSxONLY > NUMLOCI))
        {
          printf("LOCUSxONLY must be a value between 1 and NUMLOCI\n");
          Kill();
        }
    }
  else {start=0; stop=NUMLOCI;}

  printf("\n\nStarting to estimate p-values.\n");
  printf("Assuming %d population",MAXPOPS);
  if (MAXPOPS > 1) printf("s");
  printf("\n");
  printf("%d simulated values for each locus\n\n\n",NUMSIMSTATS);
  FilePrelims(PvalsFile);
  FilePrelims(FreqsFile);
  /*==========main loop=======================================*/

  for (loc=start; loc<stop; loc++) /*test for association at each candidate*/
    {
      /*reset the per-locus results: chisq/df are only computed when
        NUMPHENS==2, and teststat/pvalue only when more than one allele
        remains, yet all four are read or passed on below*/
      chisq = 0.0;
      df = 0;
      teststat = 0.0;
      pvalue = 0.0;

      numindsphen = CopyCandidate(Candidate,Geno,loc,Phenotype);  /*send full pheno data*/
      /*copy candidate data to duplicate array--only for those individuals with pheno data*/
      CopyCandidate(SimCandidate,Geno,loc,Phenotype); /*send full pheno data*/
      /*Do the same thing for a second array, to be used for simulations*/

      if (NUMPHENS==2) chisq = ComputeChiSq(Candidate,&df,modPhenotype, numindsphen);
      numalls = DropRareAlleles(Candidate,NumAlleles[loc],Translation,PoolTrans,loc,numindsphen);
      if (numalls > 1)
        {
          sumchisq += chisq; sumdf += df;

          teststat = ComputeStat(Candidate,modQ,modPhenotype,numalls,CandP,PhAlFreqs, numindsphen);
          PrintFreqs(FreqsFile,CandP,PhAlFreqs,numalls,loc+1,PoolTrans, TransPheno,Markername);

          numbigger = 0;
          for (reps = 0; reps<NUMSIMSTATS; reps++)
            {
              SimulateCandidateInd(SimCandidate,Candidate,modQ,CandP,numalls,numindsphen);
              simteststat = ComputeStat(SimCandidate,modQ,modPhenotype,numalls,
                                        SimCandP,PhAlFreqs, numindsphen);
              if (simteststat > teststat) numbigger++;
            }
          pvalue = ((double) numbigger/NUMSIMSTATS);
          testssofar++;
        }
      /*only the first call updates the histogram (upcounts=1)*/
      Printing(stdout,loc,chisq,numalls,teststat,pvalue,counts,1,Markername);
      Printing(PvalsFile,loc,chisq,numalls,teststat,pvalue,counts,0,Markername);

      if ((!(LOCUSxONLY))&&(((loc+1)%50)==0))
        {
          PrintTable(stdout,counts,testssofar);
          /*close and reopen in append mode so results so far reach disk*/
          fclose(PvalsFile); fclose(FreqsFile);
          PvalsFile = fopen(outname1,"a"); FreqsFile = fopen(outname2,"a");
          if ((PvalsFile==NULL) || (FreqsFile==NULL))
            {
              printf("Error: unable to reopen output files %s and/or %s\n",outname1,outname2);
              if (PvalsFile!=NULL) fclose(PvalsFile);
              if (FreqsFile!=NULL) fclose(FreqsFile);
              Kill();
            }
        }
    }

  if (!(LOCUSxONLY)) printf("\nsum chisq = %1.3f (%d df)",sumchisq,sumdf);

  printf("\n\n\n");
  /*skip the stdout table only if the loop printed it on its last pass;
    with LOCUSxONLY the loop never prints it, so print it here*/
  if ((LOCUSxONLY) || (loc%50 != 0)) PrintTable(stdout,counts,testssofar);
  PrintTable(PvalsFile,counts,testssofar);
  PrintAllParams(PvalsFile);
  PrintAllParams(FreqsFile);

  /*=====Closing everything down==============================*/
  FreeAllSTRAT(Geno,Individual,Translation,NumAlleles,Q,Phenotype,
               Candidate,SimCandidate,CandP,SimCandP,PhAlFreqs,PoolTrans);
  FreeEMMemory();
  /*arrays that are not passed to FreeAllSTRAT*/
  free(modQ);
  free(modPhenotype);
  free(TransPheno);
  free(Mapdistance);
  free(Phase);
  free(Markername);
  fclose(PvalsFile);  fclose(FreqsFile);
  return (0);
}

