#!/usr/bin/python

"""
haplofreq.py calculates the frequency and cumulative probability
(p-values) of four statistics by parsing the output of haploconfig.

Command line parameters:
-i input_file_name
-o output_file_name

The input file must always be specified. If the output file name is not
specified, the results are written to the screen (standard output).

The following arguments are optional:
-a alpha_value (a float number between 0 and 1)
-s summary statistic whose p-value is compared against the alpha_value
given with the '-a' option when selecting which rows to output:
  'C' or 'c' for haplotype configuration, 
  'M' or 'm' for M statistic,
  'K' or 'k' for K statistic,
  'H' or 'h' for H statistic
The default (-s not specified) is the haplotype configuration. The -s option
only takes effect when -a is also specified; without -a the full table is
written.

--version give the version of this script

Sample usage:

./haplofreq.py -i haploconfig_output -o freq_table   
will produce a table like table 6 in the paper by Innan et al. (2005)
in the file "freq_table"

./haplofreq.py -i haploconfig_output -o K_pvalue -a 0.05 -s K
will write to the file "K_pvalue" all configurations whose haplotype number (K)
test p-value is at most 0.025 in either tail (two-tailed test)

Please refer to the manual for additional help information. 

Kangyu Zhang (kangyuzh@usc.edu)
Mar. 1 2005

July 28 2007 v1.1 corrected a bug related to output option -s

"""

import sys,string
from optparse import OptionParser

class haplofreq:
    """Build frequency / cumulative-probability tables from haploconfig output.

    The input file is opened on construction.  Calling processing() reads
    every line starting with 'GraphNo', tallies the haplotype configuration
    and the M, K and H statistics, derives the cumulative counts and writes
    the resulting table to the output file (or to standard output when the
    output file name is the empty string).

    Constructor arguments:
      inputfile_name  -- path of the haploconfig output file to read
      outputfile_name -- path of the table to write; '' means sys.stdout
      alpha           -- significance level; the value -1.0 means "not given"
                         and makes output() write the full table
      alpha_stat      -- one of 'C','c','M','m','K','k','H','h'; selects the
                         statistic used for filtering when alpha is given
    """

    def __init__(self, inputfile_name, outputfile_name, alpha, alpha_stat):
        self.inputfile = open(inputfile_name, 'r')
        if outputfile_name != '':
            try:
                self.outputfile = open(outputfile_name, 'w')
            except IOError:
                # Do not leak the already opened input handle.
                self.inputfile.close()
                raise
        else:
            self.outputfile = sys.stdout
        self.linepattern = ''
        self.total_arg_number = 0

        # configuration -> number of graphs / cumulative number of graphs
        self.config_freq_dict = {}
        self.config_cumu_dict = {}

        # statistic value -> count, count of values >= it, count of values <= it
        self.M_freq_dict = {}
        self.M_larger_dict = {}
        self.M_less_dict = {}
        self.K_freq_dict = {}
        self.K_larger_dict = {}
        self.K_less_dict = {}
        self.H_freq_dict = {}
        self.H_larger_dict = {}
        self.H_less_dict = {}

        # One arg instance per distinct configuration (the first one seen).
        self.arg_list = []
        self.alpha = alpha
        self.alpha_stat = alpha_stat

    class arg:
        """The statistics of one ancestral recombination graph (one input line)."""

        # processing() reads fields up to index 19.
        FIELD_COUNT = 20

        def __init__(self, linepattern):
            self.linepattern_list = linepattern.split('\t')
            self.graphno = 0
            self.theta = 0
            self.rho = 0
            self.beta = 0
            self.gamma = 0
            self.timetomrca = 0
            self.config = ''
            self.M = 0
            self.K = 0
            self.H = 0.0

        def processing(self):
            """Copy the values out of the tab-separated line.

            Values are taken from the odd-numbered fields 1, 3, ..., 19
            (the even-numbered fields are presumably the labels).  M and K
            are converted to int and H to float; the rest stay strings.

            Raises ValueError when the line has fewer than 20 fields or
            when M, K or H cannot be converted.
            """
            fields = self.linepattern_list
            if len(fields) < self.FIELD_COUNT:
                raise ValueError(
                    "malformed GraphNo line: expected at least %d "
                    "tab-separated fields, found %d"
                    % (self.FIELD_COUNT, len(fields)))
            self.graphno = fields[1]
            self.theta = fields[3]
            self.rho = fields[5]
            self.beta = fields[7]
            self.gamma = fields[9]
            self.timetomrca = fields[11]
            self.config = fields[13]
            self.M = int(fields[15])
            self.K = int(fields[17])
            self.H = float(fields[19])

    def processing(self):
        """Parse the input file, compute all tables and write the output.

        The first line of the input (the haploconfig command line) is
        skipped; afterwards only lines starting with 'GraphNo' are used.
        The input file is closed when parsing ends, even on error.
        """
        try:
            lines = iter(self.inputfile)
            next(lines, None)  # omit the first line containing commandline argument
            for line in lines:
                self.linepattern = line
                if not line.startswith('GraphNo'):
                    continue
                graph = self.arg(line)
                graph.processing()
                self.instance_arg = graph
                self.total_arg_number += 1
                if graph.config not in self.config_freq_dict:
                    # Remember only the first graph of every configuration.
                    self.arg_list.append(graph)
                for table, key in ((self.config_freq_dict, graph.config),
                                   (self.M_freq_dict, graph.M),
                                   (self.K_freq_dict, graph.K),
                                   (self.H_freq_dict, graph.H)):
                    table[key] = table.get(key, 0) + 1
        finally:
            self.inputfile.close()

        # calculate the cumulative counts for config, M, K and H
        self.config_frequency_calculation()
        self.M_larger_dict = self.MKH_larger_frequency_calculation(self.M_freq_dict)
        self.K_larger_dict = self.MKH_larger_frequency_calculation(self.K_freq_dict)
        self.H_larger_dict = self.MKH_larger_frequency_calculation(self.H_freq_dict)
        self.M_less_dict = self.MKH_less_frequency_calculation(self.M_freq_dict)
        self.K_less_dict = self.MKH_less_frequency_calculation(self.K_freq_dict)
        self.H_less_dict = self.MKH_less_frequency_calculation(self.H_freq_dict)
        self.output()

    def config_frequency_calculation(self):
        """Fill config_cumu_dict from config_freq_dict.

        For every configuration the stored value is the summed count of all
        configurations whose count is less than or equal to its own count.
        """
        # Total number of graphs contributed by configurations of each count.
        mass_by_count = {}
        for count in self.config_freq_dict.values():
            mass_by_count[count] = mass_by_count.get(count, 0) + count
        # Running total over the counts in ascending order.
        cumulative = {}
        running = 0
        for count in sorted(mass_by_count):
            running += mass_by_count[count]
            cumulative[count] = running
        for config, count in self.config_freq_dict.items():
            self.config_cumu_dict[config] = cumulative[count]

    def MKH_larger_frequency_calculation(self, MKH_freq_dict):
        """Return {value: number of observations >= value} (count for P[M>=M(c)])."""
        MKH_larger_dict = {}
        running = 0
        for value in sorted(MKH_freq_dict, reverse=True):
            running += MKH_freq_dict[value]
            MKH_larger_dict[value] = running
        return MKH_larger_dict

    def MKH_less_frequency_calculation(self, MKH_freq_dict):
        """Return {value: number of observations <= value} (count for P[M<=M(c)])."""
        MKH_less_dict = {}
        running = 0
        for value in sorted(MKH_freq_dict):
            running += MKH_freq_dict[value]
            MKH_less_dict[value] = running
        return MKH_less_dict

    def sort_arg_list(self, statistic):
        """Sort arg_list in place for the given statistic.

        'C' -- ascending configuration count
        'M' -- descending count of observations with M >= M(c)
        'K' -- ascending count of observations with K <= K(c)
        'H' -- ascending count of observations with H <= H(c)

        Entries with equal keys keep their current relative order.  Any
        other statistic prints "error!" and exits with status 2.
        """
        if statistic == 'C':
            self.arg_list.sort(key=lambda item: self.config_freq_dict[item.config])
        elif statistic == 'M':
            self.arg_list.sort(key=lambda item: self.M_larger_dict[item.M],
                               reverse=True)
        elif statistic == 'K':
            self.arg_list.sort(key=lambda item: self.K_less_dict[item.K])
        elif statistic == 'H':
            self.arg_list.sort(key=lambda item: self.H_less_dict[item.H])
        else:
            print("error!")
            sys.exit(2)

    def _fraction(self, count):
        """Return count as a fraction of the total number of graphs read."""
        return float(count) / float(self.total_arg_number)

    def _config_label(self, config):
        """Return the configuration with its last character dropped and
        the remaining space-separated parts joined by commas."""
        # The last character is dropped unconditionally; presumably the
        # input always ends the configuration with a space -- TODO confirm.
        return ','.join(config[:-1].split(' '))

    def _write_full_table(self):
        """Write every configuration with all probabilities (no alpha given)."""
        self.sort_arg_list('C')
        self.outputfile.write("Configuration(c)\tP[C==c]\tP[C<=c](Cumulative probability)\tP[M>=M(c)]\tP[M<=M(c)]\tP[K<=K(c)]\tP[K>=K(c)]\tP[H<=H(c)]\tP[H>=H(c)]\n")
        for item in self.arg_list:
            config = item.config
            self.outputfile.write(
                "(%s)\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                    self._config_label(config),
                    self._fraction(self.config_freq_dict[config]),
                    self._fraction(self.config_cumu_dict[config]),
                    self._fraction(self.M_larger_dict[item.M]),
                    self._fraction(self.M_less_dict[item.M]),
                    self._fraction(self.K_less_dict[item.K]),
                    self._fraction(self.K_larger_dict[item.K]),
                    self._fraction(self.H_less_dict[item.H]),
                    self._fraction(self.H_larger_dict[item.H])))

    def _write_config_rows(self):
        """Write the configurations whose cumulative probability is <= alpha."""
        self.sort_arg_list('C')
        self.outputfile.write("Configuration(c)\tP[C==c]\tP[C<=c]\tM\tK\tH\n")
        for item in self.arg_list:
            config = item.config
            config_pvalue = self._fraction(self.config_cumu_dict[config])
            if config_pvalue <= self.alpha:
                self.outputfile.write(
                    "(%s)\t%s\t%s\t%s\t%s\t%s\n" % (
                        self._config_label(config),
                        self._fraction(self.config_freq_dict[config]),
                        config_pvalue, item.M, item.K, item.H))

    def _write_two_tailed(self, stat, first_dict, second_dict, header, others):
        """Write the rows whose `stat` p-value is <= alpha/2 in either tail.

        first_dict / second_dict give the counts printed in the second and
        third column; `others` names the two remaining statistics printed
        in the last two columns.
        """
        self.sort_arg_list(stat)
        self.outputfile.write(header)
        threshold = float(self.alpha) / 2
        for item in self.arg_list:
            value = getattr(item, stat)
            first_pvalue = self._fraction(first_dict[value])
            second_pvalue = self._fraction(second_dict[value])
            if first_pvalue <= threshold or second_pvalue <= threshold:
                # NOTE(review): the header labels this column P[C<=c] but the
                # value written is the point frequency P[C==c]
                # (config_freq_dict, not config_cumu_dict).  Kept as is;
                # confirm which one is intended.
                config_fraction = self._fraction(self.config_freq_dict[item.config])
                self.outputfile.write(
                    "%s\t%s\t%s\t(%s)\t%s\t%s\t%s\n" % (
                        value, first_pvalue, second_pvalue,
                        self._config_label(item.config),
                        config_fraction,
                        getattr(item, others[0]),
                        getattr(item, others[1])))

    def output(self):
        """Write the result table and release the output file.

        Without an alpha value (alpha == -1.0) the full table is written;
        otherwise only the rows selected by alpha for the statistic named
        in alpha_stat.  A real output file is closed afterwards, while
        sys.stdout is only flushed so that it stays usable.
        """
        try:
            self.outputfile.write("Command line options: %s\n" % ' '.join(sys.argv))
            if self.alpha == -1.0:  # no alpha value, output all values, sort by config frequency
                self._write_full_table()
            elif self.alpha_stat in ('C', 'c'):
                self._write_config_rows()
            elif self.alpha_stat in ('M', 'm'):
                self._write_two_tailed(
                    'M', self.M_larger_dict, self.M_less_dict,
                    "M\tP[M>=M(c)]\tP[M<=M(c)]\tConfiguration(c)\tP[C<=c]\tK\tH\n",
                    ('K', 'H'))
            elif self.alpha_stat in ('K', 'k'):
                self._write_two_tailed(
                    'K', self.K_less_dict, self.K_larger_dict,
                    "K\tP[K<=K(c)]\tP[K>=K(c)]\tConfiguration(c)\tP[C<=c]\tM\tH\n",
                    ('M', 'H'))
            elif self.alpha_stat in ('H', 'h'):
                self._write_two_tailed(
                    'H', self.H_less_dict, self.H_larger_dict,
                    "H\tP[H<=H(c)]\tP[H>=H(c)]\tConfiguration(c)\tP[C<=c]\tM\tK\n",
                    ('M', 'K'))
        finally:
            if self.outputfile is sys.stdout:
                self.outputfile.flush()
            else:
                self.outputfile.close()

def main():
    """Command-line entry point.

    Prints the module documentation and exits with status 2 when called
    without arguments.  Otherwise parses -i/-o/-s/-a, validates them
    (reporting problems through the option parser), runs the processing
    and exits with status 0.
    """
    if not sys.argv[1:]:
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(__doc__)
        sys.exit(2)

    parser = OptionParser(version="%prog 1.01, by Kangyu Zhang")
    parser.add_option("-i", "--input", action="store", type="string",
                      dest="inputfile_name",
                      help="specifying the input file name", default='')
    # An empty output file name means "write to the screen".
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="outputfile_name",
                      help="specifying the output file name", default='')
    parser.add_option("-s", "--alpha_statistic", action="store", type="string",
                      dest="alpha_statistic",
                      help="specifying which statistic to use", default='c')
    # -1.0 is the sentinel for "no alpha value given".
    parser.add_option("-a", "--alpha", action="store", type="float",
                      dest="alpha",
                      help="specifying the alpha value", default=-1.0)
    (opts, args) = parser.parse_args()

    if opts.inputfile_name == '':  # the input file is mandatory
        parser.error("Please use the option -i inputfile_name to specify the input file name")
    # Accept values in [0, 1] plus the -1.0 sentinel.
    if opts.alpha > 1.0 or (opts.alpha < 0.0 and opts.alpha != -1.0):
        parser.error("The alpha value should be a float number between 0 and 1")
    if opts.alpha_statistic not in ('C', 'c', 'K', 'k', 'M', 'm', 'H', 'h'):
        parser.error("Please specify the correct statistic, chosen from 'c','m','k','h'\n or their respective capital letters")

    instance = haplofreq(opts.inputfile_name, opts.outputfile_name,
                         opts.alpha, opts.alpha_statistic)
    instance.processing()
    sys.exit(0)

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
	
