import numpy as np
import math
import os, sys
from moments.LD import Numerics
from moments.LD import Util
import copy
import moments
# from moments.Misc import delayed_flush
from .. import Misc
from moments.LD.LDstats_mod import LDstats
from scipy.special import gammaln
import scipy.optimize
# Global count of objective-function evaluations; used by _object_func to
# decide when to emit verbose progress output.
_counter = 0
[docs]
def sigmaD2(y, normalization=0):
    """
    Compute the :math:`\\sigma_D^2` statistics normalizing by the heterozygosities
    in a given population.

    :param y: The input data.
    :type y: :class:`LDstats` object
    :param normalization: The index of the normalizing population
        (normalized by pi2_i_i_i_i and H_i_i), default set to 0.
    :type normalization: int, optional
    """
    if normalization >= y.num_pops or normalization < 0:
        raise ValueError("Normalization index must be for a present population")
    # Work on a deep copy so the caller's LDstats object is left untouched.
    out = LDstats(copy.deepcopy(y[:]), num_pops=y.num_pops, pop_ids=y.pop_ids)
    # Hoist the (invariant) index lookups out of the loop. Each LD-statistic
    # entry is divided by pi2 in the normalizing population; the final entry
    # holds single-locus (H) statistics and is divided by H.
    pi2_idx = y.names()[0].index("pi2_{0}_{0}_{0}_{0}".format(normalization))
    H_idx = y.names()[1].index("H_{0}_{0}".format(normalization))
    for i in range(len(y) - 1):
        out[i] /= y[i][pi2_idx]
    out[-1] /= y[-1][H_idx]
    return out
[docs]
def bin_stats(model_func, params, rho=None, theta=0.001, spread=None, kwargs=None):
    """
    Computes LD statistics for a given model function over bins defined by ``rho``.

    Here, ``rho`` gives the bin edges, and we assume no spaces between bins. That
    is, if the length of the input recombination rates is :math:`l`, the number of
    bins is :math:`l-1`.

    :param model_func: The model function that takes parameters in the form
        ``model_func(params, rho=rho, theta=theta, **kwargs)``.
    :param params: The parameters to evaluate the model at.
    :type params: list of floats
    :param rho: The scaled recombination rate bin edges.
    :type rho: list of floats
    :param theta: The mutation rate
    :type theta: float, optional
    :param spread: A list of length rho-1 (number of bins), where each entry is an
        array of length rho+1 (number of bins plus amount outside bin range to each
        side). Each array must sum to one.
    :type spread: list of arrays
    :param kwargs: Extra keyword arguments to pass to ``model_func``.
    """
    # None sentinels instead of mutable default arguments, which would be
    # shared (and potentially mutated) across calls.
    if rho is None:
        rho = []
    if kwargs is None:
        kwargs = {}
    if len(rho) < 2:
        raise ValueError(
            "number of recombination rates (bin edges) must be greater than one"
        )
    # Approximate the bin averages with Simpson's rule over the two bin edges
    # and the bin midpoint.
    rho_mids = (np.array(rho[:-1]) + np.array(rho[1:])) / 2
    y_edges = model_func(params, rho=rho, theta=theta, **kwargs)
    y_mids = model_func(params, rho=rho_mids, theta=theta, **kwargs)
    y = [
        1.0 / 6 * (y_edges[i] + y_edges[i + 1] + 4 * y_mids[i])
        for i in range(len(rho_mids))
    ]
    if spread is None:
        # Last entry of the model output holds heterozygosity statistics,
        # which do not depend on rho.
        y.append(y_edges[-1])
        return LDstats(y, num_pops=y_edges.num_pops, pop_ids=y_edges.pop_ids)
    else:
        if len(spread) != len(rho) - 1:
            raise ValueError("spread must be length of bins")
        y_spread = []
        for distr in spread:
            if len(distr) != len(rho) + 1:
                raise ValueError(
                    "spread distr is not the correct length (len(bins) + 2)"
                )
            if not np.isclose(np.sum(distr), 1):
                raise ValueError("spread distributions must sum to one")
            # Mix the binned statistics by the spread weights; the first and
            # last weights account for mass below/above the bin range and are
            # taken at the smallest/largest modeled rho edge.
            y_spread.append(
                (distr[0] * y_edges[0] + distr[1:-1].dot(y) + distr[-1] * y_edges[-2])
            )
        y_spread.append(y_edges[-1])
        return LDstats(y_spread, num_pops=y_edges.num_pops, pop_ids=y_edges.pop_ids)
[docs]
def remove_normalized_lds(y, normalization=0):
    """
    Returns LD statistics with the normalizing statistic removed.

    :param y: An LDstats object that has been normalized to get
        :math:`\\sigma_D^2`-formatted statistics.
    :type y: :class:`LDstats` object
    :param normalization: The index of the normalizing population.
    :type normalization: int
    """
    ld_names = y.names()[0]
    h_names = y.names()[1]
    # After normalization these entries are identically one, so they carry
    # no information and are dropped.
    drop_ld = ld_names.index("pi2_{0}_{0}_{0}_{0}".format(normalization))
    drop_h = h_names.index("H_{0}_{0}".format(normalization))
    # All entries except the last hold LD statistics for one rho bin each.
    for idx in range(len(y) - 1):
        if len(y[idx]) != len(ld_names):
            raise ValueError("Unexpected number of LD stats in data")
        y[idx] = np.delete(y[idx], drop_ld)
    # The final entry holds the single-locus heterozygosity statistics.
    if len(y[-1]) != len(h_names):
        raise ValueError("Unexpected number of H stats in data")
    y[-1] = np.delete(y[-1], drop_h)
    return y
[docs]
def remove_normalized_data(
    means, varcovs, normalization=0, num_pops=1, statistics=None
):
    """
    Returns data means and covariance matrices with the normalizing
    statistics removed.

    :param means: List of means normalized statistics, where each entry is the
        full set of statistics for a given recombination distance.
    :type means: list of arrays
    :param varcovs: List of the corresponding variance covariance matrices.
    :type varcovs: list of arrays
    :param normalization: The index of the normalizing population.
    :type normalization: int
    :param num_pops: The number of populations in the data set.
    :type num_pops: int
    :param statistics: Optional ``[ld_names, h_names]`` lists describing the
        statistics present in the data. If given, the trimmed statistics
        lists are also returned. The input lists are not modified.
    """
    if len(means) != len(varcovs):
        raise ValueError("Different lengths of means and covariances")
    if statistics is None:
        stats = Util.moment_names(num_pops)
    else:
        # Copy each inner list: copy.copy(statistics) was shallow, so the
        # pop() calls below mutated the caller's statistics lists.
        stats = [list(statistics[0]), list(statistics[1])]
    to_delete_ld = stats[0].index("pi2_{0}_{0}_{0}_{0}".format(normalization))
    to_delete_h = stats[1].index("H_{0}_{0}".format(normalization))
    ms = []
    vcs = []
    for i in range(len(means) - 1):
        if (
            len(means[i]) != len(stats[0])
            or varcovs[i].shape[0] != len(stats[0])
            or varcovs[i].shape[1] != len(stats[0])
        ):
            raise ValueError(
                "Data and statistics mismatch. Some statistics are missing "
                "or the incorrect number of populations was given."
            )
        ms.append(np.delete(means[i], to_delete_ld))
        vcs.append(
            np.delete(np.delete(varcovs[i], to_delete_ld, axis=0), to_delete_ld, axis=1)
        )
    ms.append(np.delete(means[-1], to_delete_h))
    # Single population data will have 1-D array for H
    if varcovs[-1].size > 1:
        vcs.append(
            np.delete(np.delete(varcovs[-1], to_delete_h, axis=0), to_delete_h, axis=1)
        )
    else:
        vcs.append(np.delete(varcovs[-1], to_delete_h))
    if statistics is None:
        return ms, vcs
    else:
        stats[0].pop(to_delete_ld)
        stats[1].pop(to_delete_h)
        return ms, vcs, stats
[docs]
def remove_nonpresent_statistics(y, statistics=[[], []]):
    """
    Removes data not found in the given set of statistics.

    :param y: LD statistics.
    :type y: :class:`LDstats` object.
    :param statistics: A list of lists for two and one locus statistics to keep.
    """
    # For LD statistics (j=0) and heterozygosity statistics (j=1), gather the
    # positions of every statistic that should not be kept.
    drop = [
        [i for i, name in enumerate(y.names()[j]) if name not in statistics[j]]
        for j in range(2)
    ]
    for idx in range(len(y) - 1):
        y[idx] = np.delete(y[idx], drop[0])
    y[-1] = np.delete(y[-1], drop[1])
    return y
def _multivariate_normal_pdf(x, mu, Sigma):
p = len(x)
return np.sqrt(np.linalg.det(Sigma) / (2 * math.pi) ** p) * np.exp(
-1.0 / 2 * np.dot(np.dot((x - mu).transpose(), np.linalg.inv(Sigma)), x - mu)
)
def _ll(x, mu, Sigma_inv):
"""
x = data
mu = model function output
Sigma_inv = inverse of the variance-covariance matrix
"""
if len(x) == 0:
return 0
else:
return -1.0 / 2 * np.dot(np.dot((x - mu).transpose(), Sigma_inv), x - mu)
# - len(x)*np.pi - 1./2*np.log(np.linalg.det(Sigma))
# Cache of inverted variance-covariance matrices, keyed by bin index, so that
# repeated likelihood evaluations during optimization do not re-invert
# unchanged matrices (used by ll_over_bins).
_varcov_inv_cache = {}
[docs]
def ll_over_bins(xs, mus, Sigmas):
    """
    Compute the composite log-likelihood over LD and heterozygosity statistics, given
    data and expectations. Inputs must be in the same order, and we assume each bin
    is independent, so we sum _ll(x, mu, Sigma) over each bin.

    :param xs: A list of data arrays.
    :param mus: A list of model function output arrays, same length as ``xs``.
    :param Sigmas: A list of var-cov matrices, same length as ``xs``.
    """
    if not (len(xs) == len(mus) == len(Sigmas)):
        raise ValueError(
            "Lists of data, means, and varcov matrices must be the same length"
        )
    ll_vals = []
    for idx, (x, mu, Sigma) in enumerate(zip(xs, mus, Sigmas)):
        # Reuse the cached inverse if this bin's matrix is unchanged since the
        # previous evaluation; otherwise invert it and refresh the cache.
        cached = _varcov_inv_cache.get(idx)
        if cached is not None and np.all(cached["data"] == Sigma):
            Sigma_inv = cached["inv"]
        else:
            if Sigma.size == 0:
                Sigma_inv = np.array([])
            else:
                Sigma_inv = np.linalg.inv(Sigma)
            _varcov_inv_cache[idx] = {"data": Sigma, "inv": Sigma_inv}
        # log-likelihood contribution of this (independent) bin
        ll_vals.append(_ll(x, mu, Sigma_inv))
    # sum over bins to get the composite log-likelihood
    return np.sum(ll_vals)
# Log-likelihood assigned when parameters fall outside the bounds or the model
# evaluation produces NaN; effectively -infinity for the optimizer.
_out_of_bounds_val = -1e12
def _object_func(
    params,
    model_func,
    means,
    varcovs,
    fs=None,
    rs=None,
    theta=None,
    u=None,
    Ne=None,
    lower_bound=None,
    upper_bound=None,
    verbose=0,
    flush_delay=0,
    normalization=0,
    func_args=[],
    func_kwargs={},
    fixed_params=None,
    use_afs=False,
    Leff=None,
    multinom=True,
    ns=None,
    statistics=None,
    pass_Ne=False,
    spread=None,
    output_stream=sys.stdout,
):
    """
    Objective function shared by the optimize_log_* wrappers: evaluates the
    demographic model, normalizes the statistics, and returns the *negative*
    composite log-likelihood, so that scipy minimizers can be used directly.

    ``params`` is the reduced parameter vector (fixed parameters removed);
    see the optimize_log_* functions for descriptions of the other arguments.
    """
    global _counter
    _counter += 1
    # Deal with fixed parameters
    params_up = _project_params_up(params, fixed_params)
    # Check our parameter bounds
    if lower_bound is not None:
        for pval, bound in zip(params_up, lower_bound):
            if bound is not None and pval < bound:
                # _out_of_bounds_val is negative; negated here because the
                # function's final return is also negated (minimization).
                return -_out_of_bounds_val
    if upper_bound is not None:
        for pval, bound in zip(params_up, upper_bound):
            if bound is not None and pval > bound:
                return -_out_of_bounds_val
    all_args = [params_up] + list(func_args)
    if theta is None:
        if Ne is None:
            # Ne is being fit: by convention it is the last parameter.
            Ne = params_up[-1]
            theta = 4 * Ne * u
            rhos = [4 * Ne * r for r in rs]
            if pass_Ne == False:
                # Strip Ne from the arguments given to the demographic model.
                all_args = [all_args[0][:-1]]
            else:
                all_args = [all_args[0][:]]
        else:
            theta = 4 * Ne * u
            rhos = [4 * Ne * r for r in rs]
    else:
        if Ne is not None:
            rhos = [4 * Ne * r for r in rs]
        # NOTE(review): if theta is given but Ne is None, rhos is never
        # assigned and the bin_stats call below raises NameError — presumably
        # callers always supply Ne in that case; confirm.
    ## first get ll of afs
    if use_afs == True:
        if Leff is None:
            model = theta * model_func[1](all_args[0], ns)
        else:
            model = Leff * theta * model_func[1](all_args[0], ns)
        if fs.folded:
            model = model.fold()
        if multinom == True:
            ll_afs = moments.Inference.ll_multinom(model, fs)
        else:
            ll_afs = moments.Inference.ll(model, fs)
    ## next get ll for LD stats
    # Note: this rebinds the func_kwargs parameter for the LD model call.
    func_kwargs = {"theta": theta, "rho": rhos, "spread": spread}
    stats = bin_stats(model_func[0], *all_args, **func_kwargs)
    stats = sigmaD2(stats, normalization=normalization)
    if statistics == None:
        stats = remove_normalized_lds(stats, normalization=normalization)
    else:
        stats = remove_nonpresent_statistics(stats, statistics=statistics)
    simp_stats = stats[:-1]
    het_stats = stats[-1]
    if use_afs == False:
        # Without an AFS, heterozygosity statistics are compared directly.
        simp_stats.append(het_stats)
    ## resulting ll from afs (if used) plus ll from rho bins
    if use_afs == True:
        result = ll_afs + ll_over_bins(means, simp_stats, varcovs)
    else:
        result = ll_over_bins(means, simp_stats, varcovs)
    # Bad result
    if np.isnan(result):
        print("got bad results...")
        result = _out_of_bounds_val
    if (verbose > 0) and (_counter % verbose == 0):
        param_str = "array([%s])" % (", ".join(["%- 12g" % v for v in params_up]))
        output_stream.write(
            "%-8i, %-12g, %s%s" % (_counter, result, param_str, os.linesep)
        )
        Misc.delayed_flush(delay=flush_delay)
    # Negated: optimizers minimize, we want to maximize the log-likelihood.
    return -result
def _object_func_log(log_params, *args, **kwargs):
    """
    Wrapper around _object_func that accepts parameters in log space.
    """
    params = np.exp(log_params)
    return _object_func(params, *args, **kwargs)
[docs]
def optimize_log_fmin(
    p0,
    data,
    model_func,
    rs=None,
    theta=None,
    u=2e-8,
    Ne=None,
    lower_bound=None,
    upper_bound=None,
    verbose=0,
    flush_delay=0.5,
    normalization=0,
    func_args=[],
    func_kwargs={},
    fixed_params=None,
    use_afs=False,
    Leff=None,
    multinom=False,
    ns=None,
    statistics=None,
    pass_Ne=False,
    spread=None,
    maxiter=None,
    maxfun=None,
):
    """
    Optimize (using the log of) the parameters using a downhill simplex
    algorithm. Initial parameters ``p0``, the data ``[means, varcovs]``,
    the demographic ``model_func``, and ``rs`` to specify recombination
    bin edges are required. ``Ne`` must either be specified as a keyword
    argument or is included as the *last* parameter in ``p0``.

    :param p0: The initial guess for demographic parameters,
        demography parameters plus (optionally) Ne.
    :type p0: list
    :param data: The parsed data[means, varcovs, fs]. The frequency spectrum
        fs is optional, and used only if use_afs=True.

        - Means: The list of mean statistics within each bin
          (has length ``len(rs)`` or ``len(rs) - 1`` if using AFS). If we are
          not using the AFS, which is typical, the heterozygosity statistics
          come last.
        - varcovs: The list of varcov matrices matching the data in ``means``.
    :type data: list
    :param model_func: The demographic model to compute statistics
        for a given rho. If we are using AFS, it's a list of the two models
        [LD func, AFS func]. If we're using LD stats alone, we pass a single LD
        model as a list: [LD func].
    :type model_func: list
    :param rs: The list of raw recombination rates defining bin edges.
    :type rs: list
    :param theta: The population scaled per base mutation rate
        (4*Ne*mu, not 4*Ne*mu*L).
    :type theta: float, optional
    :param u: The raw per base mutation rate.
        Cannot be used with ``theta``.
    :type u: float, optional
    :param Ne: The fixed effective population size to scale
        u and r. If ``Ne`` is a parameter to fit, it should be the last parameter
        in ``p0``.
    :type Ne: float, optional
    :param lower_bound: Defaults to ``None``. Constraints on the
        lower bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type lower_bound: list, optional
    :param upper_bound: Defaults to ``None``. Constraints on the
        upper bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type upper_bound: list, optional
    :param verbose: If an integer greater than 0, prints updates
        of the optimization procedure at intervals given by that spacing.
    :type verbose: int, optional
    :param func_args: Additional arguments to be passed
        to ``model_func``.
    :type func_args: list, optional
    :param func_kwargs: Additional keyword arguments to be
        passed to ``model_func``.
    :type func_kwargs: dict, optional
    :param fixed_params: Defaults to ``None``. To fix some
        parameters, this should be a list of equal length as ``p0``, with
        ``None`` for parameters to be fit and fixed values at corresponding
        indexes.
    :type fixed_params: list, optional
    :param use_afs: Defaults to ``False``. We can pass a model
        to compute the frequency spectrum and use
        that instead of heterozygosity statistics for single-locus data.
    :type use_afs: bool, optional
    :param Leff: The effective length of genome from which
        the fs was generated (only used if fitting to afs).
    :type Leff: float, optional
    :param multinom: Only used if we are fitting the AFS.
        If ``True``, the likelihood is computed for an optimally rescaled FS.
        If ``False``, the likelihood is computed for a fixed scaling of the FS
        found by theta=4*Ne*u and Leff
    :type multinom: bool, optional
    :param ns: The sample size, which is only needed
        if we are using the frequency spectrum, as the sample size does not
        affect mean LD statistics.
    :type ns: list of ints, optional
    :param statistics: Defaults to ``None``, which assumes that
        all statistics are present and in the conventional default order. If
        the data is missing some statistics, we must specify which statistics
        are present using the subset of statistic names given by
        ``moments.LD.Util.moment_names(num_pops)``.
    :type statistics: list, optional
    :param pass_Ne: Defaults to ``False``. If ``True``, the
        demographic model includes ``Ne`` as a parameter (in the final position
        of input parameters).
    :type pass_Ne: bool, optional
    :param maxiter: Defaults to None. Maximum number of iterations to perform.
    :type maxiter: int
    :param maxfun: Defaults to None. Maximum number of function evaluations to make.
    :type maxfun: int
    """
    output_stream = sys.stdout
    means = data[0]
    varcovs = data[1]
    if use_afs:
        try:
            fs = data[2]
        except IndexError:
            raise ValueError(
                "if use_afs=True, need to pass frequency spectrum, "
                "as data=[means,varcovs,fs]"
            )
        if ns is None:
            raise ValueError("need to set ns if we are fitting frequency spectrum")
    else:
        fs = None
    if rs is None:
        raise ValueError("need to pass rs as bin edges")
    # get num_pops by evaluating the model once at the initial parameters
    if Ne is None:
        if not pass_Ne:
            # Ne is being fit and is not passed to the model: drop it.
            y = model_func[0](p0[:-1])
        else:
            y = model_func[0](p0[:])
    else:
        y = model_func[0](p0)
    num_pops = y.num_pops
    # remove normalized statistics
    ms = copy.copy(means)
    vcs = copy.copy(varcovs)
    if statistics is None:
        # if statistics is not None, assume we already filtered out the data
        ms, vcs = remove_normalized_data(
            ms, vcs, normalization=normalization, num_pops=num_pops
        )
    args = (
        model_func,
        ms,
        vcs,
        fs,
        rs,
        theta,
        u,
        Ne,
        lower_bound,
        upper_bound,
        verbose,
        flush_delay,
        normalization,
        func_args,
        func_kwargs,
        fixed_params,
        use_afs,
        Leff,
        multinom,
        ns,
        statistics,
        pass_Ne,
        spread,
        output_stream,
    )
    p0 = _project_params_down(p0, fixed_params)
    outputs = scipy.optimize.fmin(
        _object_func_log,
        np.log(p0),
        args=args,
        full_output=True,
        disp=False,
        maxiter=maxiter,
        maxfun=maxfun,
    )
    # full_output is (xopt, fopt, n_iter, funcalls, warnflag); unpack by index
    # so we do not shadow the builtin name `iter`.
    xopt, fopt = outputs[0], outputs[1]
    xopt = _project_params_up(np.exp(xopt), fixed_params)
    return xopt, fopt
[docs]
def optimize_log_powell(
    p0,
    data,
    model_func,
    rs=None,
    theta=None,
    u=2e-8,
    Ne=None,
    lower_bound=None,
    upper_bound=None,
    verbose=0,
    flush_delay=0.5,
    normalization=0,
    func_args=[],
    func_kwargs={},
    fixed_params=None,
    use_afs=False,
    Leff=None,
    multinom=False,
    ns=None,
    statistics=None,
    pass_Ne=False,
    spread=None,
    maxiter=None,
    maxfun=None,
):
    """
    Optimize (using the log of) the parameters using the modified Powell's
    method, which optimizes slices of parameter space sequentially. Initial
    parameters ``p0``, the data ``[means, varcovs]``,
    the demographic ``model_func``, and ``rs`` to specify recombination
    bin edges are required. ``Ne`` must either be specified as a keyword
    argument or is included as the *last* parameter in ``p0``.

    :param p0: The initial guess for demographic parameters,
        demography parameters plus (optionally) Ne.
    :type p0: list
    :param data: The parsed data[means, varcovs, fs]. The frequency spectrum
        fs is optional, and used only if use_afs=True.

        - Means: The list of mean statistics within each bin
          (has length ``len(rs)`` or ``len(rs) - 1`` if using AFS). If we are
          not using the AFS, which is typical, the heterozygosity statistics
          come last.
        - varcovs: The list of varcov matrices matching the data in ``means``.
    :type data: list
    :param model_func: The demographic model to compute statistics
        for a given rho. If we are using AFS, it's a list of the two models
        [LD func, AFS func]. If we're using LD stats alone, we pass a single LD
        model as a list: [LD func].
    :type model_func: list
    :param rs: The list of raw recombination rates defining bin edges.
    :type rs: list
    :param theta: The population scaled per base mutation rate
        (4*Ne*mu, not 4*Ne*mu*L).
    :type theta: float, optional
    :param u: The raw per base mutation rate.
        Cannot be used with ``theta``.
    :type u: float, optional
    :param Ne: The fixed effective population size to scale
        u and r. If ``Ne`` is a parameter to fit, it should be the last parameter
        in ``p0``.
    :type Ne: float, optional
    :param lower_bound: Defaults to ``None``. Constraints on the
        lower bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type lower_bound: list, optional
    :param upper_bound: Defaults to ``None``. Constraints on the
        upper bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type upper_bound: list, optional
    :param verbose: If an integer greater than 0, prints updates
        of the optimization procedure at intervals given by that spacing.
    :type verbose: int, optional
    :param func_args: Additional arguments to be passed
        to ``model_func``.
    :type func_args: list, optional
    :param func_kwargs: Additional keyword arguments to be
        passed to ``model_func``.
    :type func_kwargs: dict, optional
    :param fixed_params: Defaults to ``None``. To fix some
        parameters, this should be a list of equal length as ``p0``, with
        ``None`` for parameters to be fit and fixed values at corresponding
        indexes.
    :type fixed_params: list, optional
    :param use_afs: Defaults to ``False``. We can pass a model
        to compute the frequency spectrum and use
        that instead of heterozygosity statistics for single-locus data.
    :type use_afs: bool, optional
    :param Leff: The effective length of genome from which
        the fs was generated (only used if fitting to afs).
    :type Leff: float, optional
    :param multinom: Only used if we are fitting the AFS.
        If ``True``, the likelihood is computed for an optimally rescaled FS.
        If ``False``, the likelihood is computed for a fixed scaling of the FS
        found by theta=4*Ne*u and Leff
    :type multinom: bool, optional
    :param ns: The sample size, which is only needed
        if we are using the frequency spectrum, as the sample size does not
        affect mean LD statistics.
    :type ns: list of ints, optional
    :param statistics: Defaults to ``None``, which assumes that
        all statistics are present and in the conventional default order. If
        the data is missing some statistics, we must specify which statistics
        are present using the subset of statistic names given by
        ``moments.LD.Util.moment_names(num_pops)``.
    :type statistics: list, optional
    :param pass_Ne: Defaults to ``False``. If ``True``, the
        demographic model includes ``Ne`` as a parameter (in the final position
        of input parameters).
    :type pass_Ne: bool, optional
    :param maxiter: Defaults to None. Maximum number of iterations to perform.
    :type maxiter: int
    :param maxfun: Defaults to None. Maximum number of function evaluations to make.
    :type maxfun: int
    """
    output_stream = sys.stdout
    means = data[0]
    varcovs = data[1]
    if use_afs:
        try:
            fs = data[2]
        except IndexError:
            raise ValueError(
                "if use_afs=True, need to pass frequency spectrum in data=[means,varcovs,fs]"
            )
        if ns is None:
            raise ValueError("need to set ns if we are fitting frequency spectrum")
    else:
        fs = None
    if rs is None:
        raise ValueError("need to pass rs as bin edges")
    # get num_pops by evaluating the model once at the initial parameters
    if Ne is None:
        if not pass_Ne:
            # Ne is being fit and is not passed to the model: drop it.
            y = model_func[0](p0[:-1])
        else:
            y = model_func[0](p0[:])
    else:
        y = model_func[0](p0)
    num_pops = y.num_pops
    # remove normalized statistics
    ms = copy.copy(means)
    vcs = copy.copy(varcovs)
    if statistics is None:
        # if statistics is not None, assume we already filtered out the data
        ms, vcs = remove_normalized_data(
            ms, vcs, normalization=normalization, num_pops=num_pops
        )
    args = (
        model_func,
        ms,
        vcs,
        fs,
        rs,
        theta,
        u,
        Ne,
        lower_bound,
        upper_bound,
        verbose,
        flush_delay,
        normalization,
        func_args,
        func_kwargs,
        fixed_params,
        use_afs,
        Leff,
        multinom,
        ns,
        statistics,
        pass_Ne,
        spread,
        output_stream,
    )
    p0 = _project_params_down(p0, fixed_params)
    outputs = scipy.optimize.fmin_powell(
        _object_func_log,
        np.log(p0),
        args=args,
        full_output=True,
        disp=False,
        maxiter=maxiter,
        maxfun=maxfun,
    )
    # full_output is (xopt, fopt, direc, n_iter, funcalls, warnflag); unpack
    # by index so we do not shadow the builtin name `iter`.
    xopt, fopt = outputs[0], outputs[1]
    xopt = _project_params_up(np.exp(xopt), fixed_params)
    return xopt, fopt
[docs]
def optimize_log_lbfgsb(
    p0,
    data,
    model_func,
    rs=None,
    theta=None,
    u=2e-8,
    Ne=None,
    lower_bound=None,
    upper_bound=None,
    verbose=0,
    flush_delay=0.5,
    normalization=0,
    func_args=[],
    func_kwargs={},
    fixed_params=None,
    use_afs=False,
    Leff=None,
    multinom=False,
    ns=None,
    statistics=None,
    pass_Ne=False,
    spread=None,
    maxiter=40000,
    epsilon=1e-3,
    pgtol=1e-5,
):
    """
    Optimize (using the log of) the parameters using the bound-constrained
    L-BFGS-B quasi-Newton method. Initial
    parameters ``p0``, the data ``[means, varcovs]``,
    the demographic ``model_func``, and ``rs`` to specify recombination
    bin edges are required. ``Ne`` must either be specified as a keyword
    argument or is included as the *last* parameter in ``p0``.

    It is best at burrowing down a single minimum. This method is
    better than optimize_log if the optimum lies at one or more of the
    parameter bounds. However, if your optimum is not on the bounds, this
    method may be much slower.

    Because this works in log(params), it cannot explore values of params < 0.
    It should also perform better when parameters range over scales.

    The L-BFGS-B method was developed by Ciyou Zhu, Richard Byrd, and Jorge
    Nocedal. The algorithm is described in:

    - R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
      Constrained Optimization, (1995), SIAM Journal on Scientific and
      Statistical Computing , 16, 5, pp. 1190-1208.
    - C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B,
      FORTRAN routines for large scale bound constrained optimization (1997),
      ACM Transactions on Mathematical Software, Vol 23, Num. 4, pp. 550-560.

    :param p0: The initial guess for demographic parameters,
        demography parameters plus (optionally) Ne.
    :type p0: list
    :param data: The parsed data[means, varcovs, fs]. The frequency spectrum
        fs is optional, and used only if use_afs=True.

        - Means: The list of mean statistics within each bin
          (has length ``len(rs)`` or ``len(rs) - 1`` if using AFS). If we are
          not using the AFS, which is typical, the heterozygosity statistics
          come last.
        - varcovs: The list of varcov matrices matching the data in ``means``.
    :type data: list
    :param model_func: The demographic model to compute statistics
        for a given rho. If we are using AFS, it's a list of the two models
        [LD func, AFS func]. If we're using LD stats alone, we pass a single LD
        model as a list: [LD func].
    :type model_func: list
    :param rs: The list of raw recombination rates defining bin edges.
    :type rs: list
    :param theta: The population scaled per base mutation rate
        (4*Ne*mu, not 4*Ne*mu*L).
    :type theta: float, optional
    :param u: The raw per base mutation rate.
        Cannot be used with ``theta``.
    :type u: float, optional
    :param Ne: The fixed effective population size to scale
        u and r. If ``Ne`` is a parameter to fit, it should be the last parameter
        in ``p0``.
    :type Ne: float, optional
    :param lower_bound: Defaults to ``None``. Constraints on the
        lower bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type lower_bound: list, optional
    :param upper_bound: Defaults to ``None``. Constraints on the
        upper bounds during optimization. These are given as lists of the same
        length of the parameters.
    :type upper_bound: list, optional
    :param verbose: If an integer greater than 0, prints updates
        of the optimization procedure at intervals given by that spacing.
    :type verbose: int, optional
    :param func_args: Additional arguments to be passed
        to ``model_func``.
    :type func_args: list, optional
    :param func_kwargs: Additional keyword arguments to be
        passed to ``model_func``.
    :type func_kwargs: dict, optional
    :param fixed_params: Defaults to ``None``. To fix some
        parameters, this should be a list of equal length as ``p0``, with
        ``None`` for parameters to be fit and fixed values at corresponding
        indexes.
    :type fixed_params: list, optional
    :param use_afs: Defaults to ``False``. We can pass a model
        to compute the frequency spectrum and use
        that instead of heterozygosity statistics for single-locus data.
    :type use_afs: bool, optional
    :param Leff: The effective length of genome from which
        the fs was generated (only used if fitting to afs).
    :type Leff: float, optional
    :param multinom: Only used if we are fitting the AFS.
        If ``True``, the likelihood is computed for an optimally rescaled FS.
        If ``False``, the likelihood is computed for a fixed scaling of the FS
        found by theta=4*Ne*u and Leff
    :type multinom: bool, optional
    :param ns: The sample size, which is only needed
        if we are using the frequency spectrum, as the sample size does not
        affect mean LD statistics.
    :type ns: list of ints, optional
    :param statistics: Defaults to ``None``, which assumes that
        all statistics are present and in the conventional default order. If
        the data is missing some statistics, we must specify which statistics
        are present using the subset of statistic names given by
        ``moments.LD.Util.moment_names(num_pops)``.
    :type statistics: list, optional
    :param pass_Ne: Defaults to ``False``. If ``True``, the
        demographic model includes ``Ne`` as a parameter (in the final position
        of input parameters).
    :type pass_Ne: bool, optional
    :param maxiter: Defaults to 40,000. Maximum number of iterations to perform.
    :type maxiter: int
    :param epsilon: Step-size to use for finite-difference derivatives.
    :type epsilon: float
    :param pgtol: Convergence criterion for optimization. For more info,
        see help(scipy.optimize.fmin_l_bfgs_b)
    :type pgtol: float
    """
    output_stream = sys.stdout
    means = data[0]
    varcovs = data[1]
    if use_afs:
        try:
            fs = data[2]
        except IndexError:
            raise ValueError(
                "if use_afs=True, need to pass frequency spectrum in data=[means,varcovs,fs]"
            )
        if ns is None:
            raise ValueError("need to set ns if we are fitting frequency spectrum")
    else:
        fs = None
    if rs is None:
        raise ValueError("need to pass rs as bin edges")
    # get num_pops by evaluating the model once at the initial parameters
    if Ne is None:
        if not pass_Ne:
            # Ne is being fit and is not passed to the model: drop it.
            y = model_func[0](p0[:-1])
        else:
            y = model_func[0](p0[:])
    else:
        y = model_func[0](p0)
    num_pops = y.num_pops
    # remove normalized statistics
    ms = copy.copy(means)
    vcs = copy.copy(varcovs)
    if statistics is None:
        # if statistics is not None, assume we already filtered out the data
        ms, vcs = remove_normalized_data(
            ms, vcs, normalization=normalization, num_pops=num_pops
        )
    args = (
        model_func,
        ms,
        vcs,
        fs,
        rs,
        theta,
        u,
        Ne,
        lower_bound,
        upper_bound,
        verbose,
        flush_delay,
        normalization,
        func_args,
        func_kwargs,
        fixed_params,
        use_afs,
        Leff,
        multinom,
        ns,
        statistics,
        pass_Ne,
        spread,
        output_stream,
    )
    # Make bounds list. For this method it needs to be in terms of log params.
    # Bounds are converted entry-wise so that None entries (unbounded) are
    # preserved: np.log on a list containing None raises, and assigning None
    # into a float array silently produces NaN bounds.
    if lower_bound is None:
        lower_bound = [None] * len(p0)
    else:
        lower_bound = [
            np.log(bd) if bd is not None and bd > 0 else None for bd in lower_bound
        ]
    lower_bound = _project_params_down(lower_bound, fixed_params)
    if upper_bound is None:
        upper_bound = [None] * len(p0)
    else:
        upper_bound = [
            np.log(bd) if bd is not None and bd > 0 else None for bd in upper_bound
        ]
    upper_bound = _project_params_down(upper_bound, fixed_params)
    bounds = list(zip(lower_bound, upper_bound))
    p0 = _project_params_down(p0, fixed_params)
    outputs = scipy.optimize.fmin_l_bfgs_b(
        _object_func_log,
        np.log(p0),
        bounds=bounds,
        epsilon=epsilon,
        args=args,
        iprint=-1,
        pgtol=pgtol,
        maxiter=maxiter,
        approx_grad=True,
    )
    xopt, fopt, info_dict = outputs
    xopt = _project_params_up(np.exp(xopt), fixed_params)
    return xopt, fopt
def _project_params_down(pin, fixed_params):
"""
Eliminate fixed parameters from pin.
"""
if fixed_params is None:
return pin
if len(pin) != len(fixed_params):
raise ValueError(
"fixed_params list must have same length as input " "parameter array."
)
pout = []
for ii, (curr_val, fixed_val) in enumerate(zip(pin, fixed_params)):
if fixed_val is None:
pout.append(curr_val)
return np.array(pout)
def _project_params_up(pin, fixed_params):
"""
Fold fixed parameters into pin.
"""
if fixed_params is None:
return pin
if np.isscalar(pin):
pin = [pin]
pout = np.zeros(len(fixed_params))
orig_ii = 0
for out_ii, val in enumerate(fixed_params):
if val is None:
pout[out_ii] = pin[orig_ii]
orig_ii += 1
else:
pout[out_ii] = fixed_params[out_ii]
return pout