Source code for anhima.ld

"""
Utilities for calculating and plotting linkage disequilbrium.

See also the examples at:

- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/ld.ipynb

"""  # noqa


from __future__ import division, print_function, unicode_literals, \
    absolute_import


# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.stats as stats
import scipy.spatial.distance as distance


# internal dependencies
import anhima.loc
from anhima.opt.ld import ld_prune_pairwise_uint8 as _ld_prune_pairwise_uint8


[docs]def pairwise_genotype_ld(gn):
    """Given a set of genotypes at biallelic variants, calculate the
    square of the correlation coefficient between all distinct pairs
    of variants.

    Parameters
    ----------

    gn : array_like
        A 2-dimensional array of shape (`n_variants`, `n_samples`) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.

    Returns
    -------

    r_squared : ndarray, float
        A 2-dimensional array of squared correlation coefficients between
        each pair of variants.

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 2

    # TODO deal with missing genotypes
    return np.corrcoef(gn) ** 2


[docs]def plot_pairwise_ld(r_squared, cmap='Greys', flip=True, ax=None):
    """Make a classic triangular linkage disequilibrium plot, given an
    array of pairwise correlation coefficients between variants.

    Parameters
    ----------

    r_squared : array_like
        A square 2-dimensional array of squared correlation coefficients
        between pairs of variants.
    cmap : color map, optional
        The color map to use when plotting. Defaults to 'Greys' (0=white,
        1=black).
    flip : bool, optional
        If True, draw the triangle upside down.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn

    """

    # check inputs
    r_squared = np.asarray(r_squared)
    assert r_squared.ndim == 2
    assert r_squared.shape[0] == r_squared.shape[1]

    # setup axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        fig = plt.figure(figsize=(x, x//2))
        ax = fig.add_axes((0, 0, 1, 1))

    # define transformation to rotate the colormesh
    trans = mpl.transforms.Affine2D().rotate_deg_around(0, 0, -45)
    trans = trans + ax.transData

    # plot the data as a colormesh
    ax.pcolormesh(r_squared, cmap=cmap, vmin=0, vmax=1, transform=trans)

    # cut the plot in half so we see a triangle
    ax.set_ylim(bottom=0)

    # turn the triangle upside down
    if flip:
        ax.invert_yaxis()

    # remove axis lines
    ax.set_axis_off()

    return ax


[docs]def plot_windowed_ld(gn, pos, window_size, start_position=None,
                     stop_position=None, percentiles=(5, 95), ax=None,
                     median_plot_kwargs=None,
                     percentiles_plot_kwargs=None):
    """Plot average LD within non-overlapping genome windows.

    Parameters
    ----------

    gn : array_like
        A 2-dimensional array of shape (`n_variants`, `n_samples`) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.
    pos : array_like
        A 1-dimensional array of genomic positions of variants.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.
    percentiles : sequence of integers, optional
        Percentiles to plot in addition to the median.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    median_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the median line.
    percentiles_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the percentiles.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 2

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        fig = plt.figure(figsize=(x, x//3))
        ax = fig.add_subplot(111)

    # determine bins
    if stop_position is None:
        stop_position = np.max(pos)
    if start_position is None:
        start_position = np.min(pos)
    bin_edges = np.arange(start_position, stop_position, window_size)
    n_bins = len(bin_edges) - 1

    # initialise plotting variables
    med = np.zeros((n_bins,), dtype='f4')
    if percentiles:
        pc = np.zeros((n_bins, len(percentiles)), dtype='f4')

    # iterate over bins
    for n in range(n_bins):

        # determine bin start and stop positions
        bin_start = bin_edges[n]
        bin_stop = bin_edges[n + 1]

        # map genome positions onto variant indices
        loc = anhima.loc.locate_interval(pos, bin_start, bin_stop)

        if loc.stop - loc.start > 0:

            # view genotypes for the current region
            gw = gn[loc, :]

            # calculate pairwise LD
            r_squared = pairwise_genotype_ld(gw)

            # convert to non-redundance form
            r_squared_nonredundant = distance.squareform(r_squared,
                                                         checks=False)

            # calculate median
            med[n] = np.median(r_squared_nonredundant)

            # calculate percentiles
            if percentiles:
                for i, p in enumerate(percentiles):
                    pc[n, i] = np.percentile(r_squared_nonredundant, p)

    # determine x coordinates for plotting, as bin centers
    x = (bin_edges[1:] + bin_edges[:-1]) / 2

    # plot median
    if median_plot_kwargs is None:
        median_plot_kwargs = dict()
    median_plot_kwargs.setdefault('linestyle', '-')
    median_plot_kwargs.setdefault('color', 'k')
    median_plot_kwargs.setdefault('linewidth', 2)
    ax.plot(x, med, **median_plot_kwargs)

    # plot percentiles
    if percentiles:
        if percentiles_plot_kwargs is None:
            percentiles_plot_kwargs = dict()
        percentiles_plot_kwargs.setdefault('linestyle', '--')
        percentiles_plot_kwargs.setdefault('color', 'k')
        percentiles_plot_kwargs.setdefault('linewidth', 1)
        for i, p in enumerate(percentiles):
            ax.plot(x, pc[:, i], **percentiles_plot_kwargs)

    # tidy up
    ax.set_xlabel('position')
    ax.set_ylabel('$r^2$', rotation=0)
    ax.grid(axis='y')

    return ax


[docs]def ld_prune_pairwise(gn, window_size=100, window_step=10, max_r_squared=.2):
    """Given a set of genotypes at biallelic variants, find a subset
    of the variants which are in approximate linkage equilibrium with
    each other.

    Parameters
    ----------

    gn : array_like
        A 2-dimensional array of shape (`n_variants`, `n_samples`) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.
    window_size : int, optional
        The number of variants to work with at a time.
    window_step : int, optional
        The number of variants to shift the window by.
    max_r_squared : float, optional
        The maximum value of the genotype correlation coefficient, above which
        variants will be excluded.

    Returns
    -------

    included : ndarray, bool
        A boolean array of the same length as the number of variants,
        where a True value indicates the variant at the corresponding
        index is included, and a False value indicates the corresponding
        variant is excluded.

    Notes
    -----

    The algorithm is as follows. A window of `window_size` variants is
    taken from the beginning of the genotypes array. The genotype
    correlation coefficient is calculated between each pair of
    variants in the window. The first variant in the window is
    considered, and any other variants in the window with linkage
    above `max_r_squared` with respect to the first variant is
    excluded. The next non-excluded variant in the window is then
    considered, and so on. The window then shifts along by
    `window_step` variants, and the process is repeated.

    """

    # check input array
    gn = np.asarray(gn).astype('u1')
    assert gn.ndim == 2

    # use optimised implementation
    included = _ld_prune_pairwise_uint8(gn, window_size, window_step,
                                        max_r_squared)
    return included


[docs]def pairwise_ld_decay(r_squared, pos, step=1):
    """Compile data on linkage disequilibrium, separation (in number
    of variants), and physical distance between pairs of variants.

    Parameters
    ----------

    r_squared : array_like
        A square 2-dimensional array of squared correlation coefficients
        between pairs of variants.
    pos : array_like
        A 1-dimensional array of genomic positions of variants.
    step : int, optional
        When compiling the data, advance `step` variants.

    Returns
    -------

    cor : ndarray, float
        Each element in the array is the squared genotype correlation
        coefficient between a distinct pair of variants.
    sep : ndarray, int
        Each element in the array is the separation (in number of variants)
        between a distinct pair of variants.
    dist : ndarray, int
        Each element in the array is the physical distance between a distinct
        pair of variants.

    See Also
    --------

    windowed_ld_decay

    """

    # check inputs
    r_squared = np.asarray(r_squared)
    assert r_squared.ndim == 2
    assert r_squared.shape[0] == r_squared.shape[1]

    # determine the number of variants
    n_variants = r_squared.shape[0]

    # determine pairs of variants to use
    pairs = [(i, j)
             for i in range(0, n_variants, step)
             for j in range(i+1, n_variants)]

    # initialise output arrays
    cor = np.zeros((len(pairs),), dtype=np.float)
    sep = np.zeros((len(pairs),), dtype=np.int)
    dist = np.zeros((len(pairs),), dtype=np.int)

    # iterate over pairs
    for n, (i, j) in enumerate(pairs):
        cor[n] = r_squared[i, j]
        sep[n] = j - i
        dist[n] = np.abs(pos[j] - pos[i])

    return cor, sep, dist


[docs]def windowed_ld_decay(gn, pos, window_size, step=1):
    """Compile data on linkage disequilibrium, separation (in number
    of variants), and physical distance between pairs of variants.

    Parameters
    ----------

    gn : array_like
        A 2-dimensional array of shape (`n_variants`, `n_samples`) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.
    pos : array_like
        A 1-dimensional array of genomic positions of variants.
    window_size : int, optional
        The number of variants to work with at a time.
    step : int, optional
        When compiling the data within each window, advance `step` variants.

    Returns
    -------

    cor : ndarray, float
        Each element in the array is the squared genotype correlation
        coefficient between a distinct pair of variants.
    sep : ndarray, int
        Each element in the array is the separation (in number of variants)
        between a distinct pair of variants.
    dist : ndarray, int
        Each element in the array is the physical distance between a distinct
        pair of variants.

    See Also
    --------

    pairwise_ld_decay

    Notes
    -----

    Similar to :func:`pairwise_ld_decay` except that not all pairs of
    variants are sampled to speed up computation and use less memory. Variants
    are divided into non-overlapping windows of size `window_size`. Genotype LD
    is calculated for all pairs within each window.

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 2

    # determine number of variants
    n_variants = gn.shape[0]

    # initialise output variables
    all_cor = list()
    all_sep = list()
    all_dist = list()

    # iterate over non-overlapping windows of variants
    for window_start in range(0, n_variants, window_size):

        # determine extent of current window
        window_stop = min(window_start + window_size, n_variants)

        # view genotypes for the current window
        gw = gn[window_start:window_stop, :]

        # calculate LD
        r_squared = pairwise_genotype_ld(gw)

        # compile data
        cor, sep, dist = pairwise_ld_decay(r_squared, pos, step=step)
        all_cor.append(cor)
        all_sep.append(sep)
        all_dist.append(dist)

    # concatenate results from each window
    all_cor = np.concatenate(all_cor)
    all_sep = np.concatenate(all_sep)
    all_dist = np.concatenate(all_dist)

    return all_cor, all_sep, all_dist


[docs]def plot_ld_decay_by_separation(cor, sep,
                                max_separation=100,
                                percentiles=(5, 95),
                                ax=None,
                                median_plot_kwargs=None,
                                percentiles_plot_kwargs=None):
    """Plot the decay of linkage disequilibrium with separation
    between variants.

    Parameters
    ----------

    cor : array_like
        A 1-dimensional array of squared correlation coefficients between
        pairs of variants.
    sep : array_like
        A 1-dimensional array of separations (in number of variants) between
        pairs of variants.
    max_separation : int, optional
        Maximum separation to consider.
    percentiles : sequence of integers, optional
        Percentiles to plot in addition to the median.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    median_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the median line.
    percentiles_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the percentiles.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check inputs
    cor = np.asarray(cor)
    sep = np.asarray(sep)
    assert cor.ndim == 1
    assert sep.ndim == 1
    assert cor.shape[0] == sep.shape[0]

    # set up axes
    if ax is None:
        fig, ax = plt.subplots()

    # set up arrays for plotting
    cor_median = np.zeros((max_separation,), dtype='f4')
    if percentiles:
        cor_percentiles = np.zeros((max_separation, len(percentiles)),
                                   dtype='f4')

    # iterate over separations, compiling data
    for i in range(max_separation):

        # view correlations at the given separation
        c = cor[sep == i]

        # calculate median and percentiles
        if len(c) > 0:
            cor_median[i] = np.median(c)
            if percentiles:
                for n, p in enumerate(percentiles):
                    cor_percentiles[i, n] = np.percentile(c, p)

    # plot the median
    x = range(max_separation)
    y = cor_median
    if median_plot_kwargs is None:
        median_plot_kwargs = dict()
    median_plot_kwargs.setdefault('linestyle', '-')
    median_plot_kwargs.setdefault('color', 'k')
    median_plot_kwargs.setdefault('linewidth', 2)
    plt.plot(x, y, label='median', **median_plot_kwargs)

    # plot percentiles
    if percentiles:
        if percentiles_plot_kwargs is None:
            percentiles_plot_kwargs = dict()
        percentiles_plot_kwargs.setdefault('linestyle', '--')
        percentiles_plot_kwargs.setdefault('color', 'k')
        percentiles_plot_kwargs.setdefault('linewidth', 1)
        for n, p in enumerate(percentiles):
            y = cor_percentiles[:, n]
            plt.plot(x, y, label='%s%%' % p, **percentiles_plot_kwargs)

    # tidy up
    ax.set_xlim(left=1, right=max_separation)
    ax.set_ylim(0, 1)
    ax.set_xlabel('separation')
    ax.set_ylabel('$r^2$', rotation=0)
    ax.grid(axis='y')

    return ax


[docs]def plot_ld_decay_by_distance(cor, dist, bins,
                              percentiles=(5, 95),
                              ax=None,
                              median_plot_kwargs=None,
                              percentiles_plot_kwargs=None):
    """Plot the decay of linkage disequilibrium with physical distance
    between variants.

    Parameters
    ----------

    cor : array_like
        A 1-dimensional array of squared correlation coefficients between
        pairs of variants.
    dist : array_like
        A 1-dimensional array of physical distances between pairs of variants.
    bins : int or sequence of ints
        Number of bins or bin edges. Bins of distance to calculate LD within.
    percentiles : sequence of integers, optional
        Percentiles to plot in addition to the median.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    median_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the median line.
    percentiles_plot_kwargs : dict, optional
        Keyword arguments to pass through when plotting the percentiles.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check inputs
    cor = np.asarray(cor)
    dist = np.asarray(dist)
    assert cor.ndim == 1
    assert dist.ndim == 1
    assert cor.shape[0] == dist.shape[0]

    # set up axes
    if ax is None:
        fig, ax = plt.subplots()

    # calculate the median of correlation values within bins
    y, bin_edges, _ = stats.binned_statistic(dist, values=cor, bins=bins,
                                             statistic=np.median)

    # determine x axis variable as bin centers
    x = (bin_edges[:-1] + bin_edges[1:]) / 2

    # plot median
    if median_plot_kwargs is None:
        median_plot_kwargs = dict()
    median_plot_kwargs.setdefault('linestyle', '-')
    median_plot_kwargs.setdefault('color', 'k')
    median_plot_kwargs.setdefault('linewidth', 2)
    ax.plot(x, y, label='median', **median_plot_kwargs)

    # calculate and plot percentiles
    if percentiles:
        if percentiles_plot_kwargs is None:
            percentiles_plot_kwargs = dict()
        percentiles_plot_kwargs.setdefault('linestyle', '--')
        percentiles_plot_kwargs.setdefault('color', 'k')
        percentiles_plot_kwargs.setdefault('linewidth', 1)
        for p in percentiles:
            y, bin_edges, _ = stats.binned_statistic(
                dist,
                values=cor,
                bins=bins,
                statistic=lambda v: np.percentile(v, p)
            )
            ax.plot(x, y, label='%s%%' % p, **percentiles_plot_kwargs)

    # tidy up
    ax.set_xlim(np.min(x), np.max(x))
    ax.set_ylim(0, 1)
    ax.set_xlabel('distance')
    ax.set_ylabel('$r^2$', rotation=0)
    ax.grid(axis='y')

    return ax