# -*- coding: utf-8 -*-
"""
Site frequency spectra.
See also the examples at:
- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/sf.ipynb
""" # noqa
from __future__ import division, print_function, absolute_import
# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
[docs]def site_frequency_spectrum(derived_ac):
"""Calculate the site frequency spectrum, given derived allele counts for a
set of biallelic variant sites.
Parameters
----------
derived_ac : array_like, int
A 1-dimensional array of shape (n_variants,) where each array
element holds the count of derived alleles found for a single variant
across some set of samples.
Returns
-------
sfs : ndarray, int
An array of integers where the value of the kth element is the
number of variant sites with k derived alleles.
See Also
--------
site_frequency_spectrum_scaled, site_frequency_spectrum_folded,
site_frequency_spectrum_folded_scaled, plot_site_frequency_spectrum
"""
# check input
derived_ac = np.asarray(derived_ac)
assert derived_ac.ndim == 1
# calculate frequency spectrum
sfs = np.bincount(derived_ac)
return sfs
[docs]def site_frequency_spectrum_folded(biallelic_ac):
"""Calculate the folded site frequency spectrum, given reference and
alternate allele counts for a set of biallelic variants.
Parameters
----------
biallelic_ac : array_like int
A 2-dimensional array of shape (n_variants, 2), where each row
holds the reference and alternate allele counts for a single
biallelic variant across some set of samples.
Returns
-------
sfs_folded : ndarray, int
An array of integers where the value of the kth element is the
number of variant sites with k observations of the minor allele.
See Also
--------
site_frequency_spectrum, site_frequency_spectrum_scaled,
site_frequency_spectrum_folded_scaled, plot_site_frequency_spectrum
"""
# check input
biallelic_ac = np.asarray(biallelic_ac)
assert biallelic_ac.ndim == 2
assert biallelic_ac.shape[1] == 2
# calculate minor allele counts
minor_ac = np.amin(biallelic_ac, axis=1)
# calculate frequency spectrum
sfs_folded = np.bincount(minor_ac)
return sfs_folded
[docs]def site_frequency_spectrum_scaled(derived_ac):
"""Calculate the site frequency spectrum, scaled such that a constant value
is expected across the spectrum for neutral variation and a population at
constant size.
Parameters
----------
derived_ac : array_like, int
A 1-dimensional array of shape (n_variants,) where each array
element holds the count of derived alleles found for a single variant
across some set of samples.
Returns
-------
sfs_scaled : ndarray, int
An array of integers where the value of the kth element is the
number of variant sites with k derived alleles, multiplied by k.
Notes
-----
Under neutrality and constant population size, site frequency
is expected to be constant across the spectrum, and to approximate
the value of the population-scaled mutation rate theta.
See Also
--------
site_frequency_spectrum, site_frequency_spectrum_folded,
site_frequency_spectrum_folded_scaled, plot_site_frequency_spectrum
"""
# calculate frequency spectrum
sfs = site_frequency_spectrum(derived_ac)
# scaling
k = np.arange(sfs.size)
sfs_scaled = sfs * k
return sfs_scaled
[docs]def site_frequency_spectrum_folded_scaled(biallelic_ac, m=None):
"""Calculate the folded site frequency spectrum, scaled such that a
constant value is expected across the spectrum for neutral variation and
a population at constant size.
Parameters
----------
biallelic_ac : array_like int
A 2-dimensional array of shape (n_variants, 2), where each row
holds the reference and alternate allele counts for a single
biallelic variant across some set of samples.
m : int, optional
The total number of alleles observed at each variant site. Equal to
the number of samples multiplied by the ploidy. If not provided,
will be inferred to be the maximum value of the sum of reference and
alternate allele counts present in `biallelic_ac`.
Returns
-------
sfs_folded_scaled : ndarray, int
An array of integers where the value of the kth element is the
number of variant sites with k observations of the minor allele,
multiplied by the scaling factor (k * (m - k) / m).
Notes
-----
Under neutrality and constant population size, site frequency
is expected to be constant across the spectrum, and to approximate
the value of the population-scaled mutation rate theta.
This function is useful where the ancestral and derived status of alleles
is unknown.
See Also
--------
site_frequency_spectrum, site_frequency_spectrum_scaled,
site_frequency_spectrum_folded, plot_site_frequency_spectrum
"""
# calculate the folded site frequency spectrum
sfs_folded = site_frequency_spectrum_folded(biallelic_ac)
# determine the total number of alleles per variant
if m is None:
m = np.amax(np.sum(biallelic_ac, axis=1))
# scaling
k = np.arange(sfs_folded.size)
sfs_folded_scaled = sfs_folded * k * (m - k) / m
return sfs_folded_scaled
[docs]def plot_site_frequency_spectrum(sfs, bins=None, m=None,
clip_endpoints=True, ax=None, label=None,
plot_kwargs=None):
"""Plot a site frequency spectrum.
Parameters
----------
sfs : array_like, int
Site frequency spectrum. Can be folded or unfolded, scaled or
unscaled.
bins : int or sequence of ints, optional
Number of bins or bin edges to aggregate frequencies. If not given,
no binning will be applied.
m : int, optional
The total number of alleles observed at each variant site. Equal to
the number of samples multiplied by the ploidy. If given, will be
used to scale the X axis as allele frequency instead of allele count.
used to scale the X axis as allele frequency instead of allele count.
clip_endpoints : bool, optional
If True, remove the first and last values from the site frequency
spectrum.
ax : axes, optional
The axes on which to plot. If not given, a new figure will be created.
label : string, optional
Label for this data series.
plot_kwargs : dict, optional
Passed through to ax.plot().
Returns
-------
ax : axes
The axes on which the plot was drawn.
See Also
--------
site_frequency_spectrum, site_frequency_spectrum_folded,
site_frequency_spectrum_scaled, site_frequency_spectrum_folded_scaled
"""
if ax is None:
fig, ax = plt.subplots()
if bins is None:
# no binning
if clip_endpoints:
x = np.arange(1, sfs.size-1)
y = sfs[1:-1]
else:
x = np.arange(sfs.size)
y = sfs
else:
# bin the frequencies
if clip_endpoints:
y, b, _ = scipy.stats.binned_statistic(np.arange(1, sfs.size-1),
values=sfs[1:-1],
bins=bins,
statistic='mean')
else:
y, b, _ = scipy.stats.binned_statistic(np.arange(sfs.size),
values=sfs,
bins=bins,
statistic='mean')
# use bin midpoints for plotting
x = (b[:-1] + b[1:]) / 2
if m is not None:
# convert allele counts to allele frequencies
x = x / m
ax.set_xlabel('allele frequency')
else:
ax.set_xlabel('allele count')
# plotting
if plot_kwargs is None:
plot_kwargs = dict()
ax.plot(x, y, label=label, **plot_kwargs)
# tidy up
ax.set_ylabel('site frequency')
return ax