import argparse

import numpy as np
import statsmodels.api as sm
from scipy.optimize import minimize as optim
from scipy.stats import chi2
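
# Expected input: whitespace-delimited files with a header line providing at
# least the columns 'rs', 'chr', 'pos' and 'hapflk' (typically the output of
# the hapflk program). Each input file gets a companion output file with the
# suffix '_sc' adding the rescaled statistic and its chi-squared p-value.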


def set_options():
    myparser = argparse.ArgumentParser(
        description='Estimate hapflk distribution and/or calculate hapFLK p-values')
    myparser.add_argument('infiles', metavar='file', type=str, nargs='+',
                          help='An input file')
    myparser.add_argument('K', metavar='K', type=float,
                          help='Number of clusters')
    myparser.add_argument('N', metavar='N', type=float,
                          help='Number of populations')
    myparser.add_argument('--pars', nargs=2, metavar=('mu', 'beta'), type=float,
                          default=None, help='regression parameters')
    myparser.add_argument('--optimize', action='store_true', default=False,
                          help='Optimize chi-squared df')
    return myparser
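

# Example invocation (script and file names below are purely illustrative):
#
#   python scaling_chi2_hapflk.py chr1.hapflk chr2.hapflk 15 4 --optimize
#
# i.e. one or more hapFLK result files, followed by the number of clusters K
# and the number of populations N used in the hapFLK run.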


def estimate_hapflk_qreg(sample, K, Npop, probs=np.linspace(0.05, 0.95, num=19)):
    """Estimate scaling parameters by robust regression of theoretical
    chi-squared quantiles on observed hapFLK quantiles."""
    # Observed quantiles of the hapFLK sample
    oq = np.percentile(sample, q=list(100 * probs))
    # Theoretical quantiles of a chi-squared with (K-1)(Npop-1) df
    mydf = (K - 1) * (Npop - 1)
    tq = chi2.ppf(probs, df=mydf)
    # Robust linear fit: tq ~ mu + beta * oq
    reg = sm.RLM(tq, np.vstack([np.ones(len(oq)), oq]).T)
    res = reg.fit()
    mu, beta = res.params
    return {'mu': mu, 'beta': beta, 'df': mydf}
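

# The fitted parameters map raw hapFLK values onto a chi-squared scale:
# mu + beta * hapflk is treated as chi-squared distributed with (K-1)(N-1)
# degrees of freedom. For instance (hypothetical values), K=15 clusters and
# N=4 populations give (15-1)*(4-1) = 42 degrees of freedom.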


def qfunc(ndf, oq, probs):
    """Objective for the df optimization: sum of squared residuals of the
    robust quantile regression for a candidate number of df `ndf`."""
    tq = chi2.ppf(probs, df=ndf)
    reg = sm.RLM(tq, np.vstack([np.ones(len(oq)), oq]).T)
    res = reg.fit()
    return np.sum(np.power(res.resid, 2))


def optimize_hapflk_qreg(sample, K, Npop, probs=np.linspace(0.05, 0.95, num=19)):
    """Same as estimate_hapflk_qreg, but the number of df is optimized
    numerically rather than fixed at (K-1)(Npop-1)."""
    oq = np.percentile(sample, q=list(100 * probs))
    # Minimize the regression residuals over the df, starting from K
    res = optim(qfunc, K, args=(oq, probs))
    mydf = res.x[0]
    # Refit the robust regression at the optimized df to get mu and beta
    tq = chi2.ppf(probs, df=mydf)
    reg = sm.RLM(tq, np.vstack([np.ones(len(oq)), oq]).T)
    res = reg.fit()
    mu, beta = res.params
    return {'mu': mu, 'beta': beta, 'df': mydf}
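

# A minimal sketch of standalone use, assuming `values` holds a genome-wide
# sample of hapFLK statistics (variable name hypothetical):
#
#   pars = optimize_hapflk_qreg(values, K=15, Npop=4)
#   scaled, p = scale_hapflk_sample(values[0], pars)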


def scale_hapflk_sample(sample, params):
    """Rescale a single hapFLK value and return (scaled value, p-value)."""
    scaled_sample = params['mu'] + params['beta'] * sample
    # The affine rescaling can produce negative values; clamp at zero since
    # the chi-squared distribution has no support below zero.
    if scaled_sample < 0:
        scaled_sample = 0
    pvalue = chi2.sf(scaled_sample, df=params['df'])
    return scaled_sample, pvalue
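

# Worked example with hypothetical parameters: for
# {'mu': 0.1, 'beta': 0.9, 'df': 42} and a raw hapFLK value of 45, the scaled
# statistic is 0.1 + 0.9 * 45 = 40.6 and the p-value is chi2.sf(40.6, df=42).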


def main():
    parser = set_options()
    args = parser.parse_args()
    flog = open('scaling_hapflk.log', 'w')
    datasets = []
    filenames = []
    for fic in args.infiles:
        print(fic)
        # encoding=None makes genfromtxt return str (not bytes) columns
        dset = np.genfromtxt(fic, dtype=None, names=True, encoding=None)
        datasets.append(dset)
        filenames.append(fic)
    if args.pars is None:
        # Estimate the scaling parameters from the pooled hapFLK values
        hflk_sample = np.concatenate([x['hapflk'] for x in datasets])
        if args.optimize:
            reg_par = optimize_hapflk_qreg(hflk_sample, args.K, args.N)
        else:
            reg_par = estimate_hapflk_qreg(hflk_sample, args.K, args.N)
        print(reg_par)
    else:
        reg_par = {'mu': args.pars[0], 'beta': args.pars[1],
                   'df': (args.K - 1) * (args.N - 1)}

    print("regression parameters", file=flog)
    for k, v in reg_par.items():
        print(k, v, file=flog)
    flog.close()

    # Write one '_sc' file per input, adding the scaled statistic and p-value
    for i, d in enumerate(datasets):
        with open(filenames[i] + '_sc', 'w') as fout:
            print('rs', 'chr', 'pos', 'hapflk', 'hapflk_scaled', 'pvalue', file=fout)
            for x in d:
                xs = scale_hapflk_sample(x['hapflk'], reg_par)
                print(x['rs'], x['chr'], x['pos'], x['hapflk'], xs[0], xs[1], file=fout)


if __name__ == '__main__':
    main()