Source code for anhima.f2

"""
Doubleton sharing, a.k.a., analysis of f2 variants.

See also the examples at:

- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/f2.ipynb

"""  # noqa


from __future__ import division, print_function, unicode_literals, \
    absolute_import


# standard library dependencies
import itertools


# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import scipy.special


[docs]def count_shared_doubletons(subpops_ac): """Count subpopulation pairs sharing doubletons (where one allele is observed in each subpopulation). Parameters ---------- subpops_ac : array_like, int An array of shape (n_variants, n_subpops) holding alternate allele counts for each subpopulation. Returns ------- counts : ndarray, int or float A square matrix of shape (n_subpops, n_subpops) where the array element at index (i, j) holds the count of shared doubletons between the ith and jth subpopulations. See Also -------- normalise_doubleton_counts, plot_shared_doubletons, plot_total_doubletons, plot_f2_fig """ # check input subpops_ac = np.asarray(subpops_ac) assert subpops_ac.ndim == 2 # find doubletons in the total population is_total_doubleton = np.sum(subpops_ac, axis=1) == 2 subpops_ac_doubletons = np.compress(is_total_doubleton, subpops_ac, axis=0) # count subpopulaton pairs sharing doubletons n_subpops = subpops_ac.shape[1] counts = np.zeros((n_subpops, n_subpops), dtype=np.int) for i in range(n_subpops): for j in range(i, n_subpops): if i == j: # count cases where doubleton is private to a subpopulation n = np.count_nonzero(subpops_ac_doubletons[:, i] == 2) else: # count cases where doubleton is shared between two # subpopulations n = np.count_nonzero((subpops_ac_doubletons[:, i] == 1) & (subpops_ac_doubletons[:, j] == 1)) # fill both upper and lower triangles counts[i, j] = n counts[j, i] = n return counts
[docs]def normalise_doubleton_counts(counts, n_samples, ploidy=2): """Normalise doubleton counts by dividing by the number of distinct pairs of haplotypes in each population comparison. Parameters ---------- counts : array_like, ints A square matrix of shape (n_subpops, n_subpops) where the array element at index (i, j) holds the count of shared doubletons between the ith and jth subpopulations. n_samples : int or sequence of ints The number of samples in each sub-population. ploidy : int, optional The sample ploidy. Returns ------- normed_counts : ndarray, float Normalised counts of shared doubletons. See Also -------- count_shared_doubletons Notes ----- This function corrects for the fact that there are fewer pairs of haplotypes when looking for doubletons within a single subpopulation of size n than there are when comparing two different subpopulations of size n. This function may also help to correct for the case where the number of samples from each subpopulation is not equal. However, note that if this is the case then there may still also be some bias in how doubletons have been ascertained. """ # check inputs counts = np.asarray(counts) assert counts.ndim == 2 assert counts.shape[0] == counts.shape[1] n_subpops = counts.shape[0] # deal with polymorphic input if isinstance(n_samples, int): n_samples = [n_samples] * n_subpops n_samples = np.asarray(n_samples) # normalise counts normed_counts = np.zeros((n_subpops, n_subpops), dtype=np.float) for i in range(n_subpops): for j in range(i, n_subpops): if i == j: # number of distinct pairs of haplotypes within a # subpopulation = number of haplotypes choose 2 n_haplotypes = n_samples[i] * ploidy n_pairs = scipy.special.comb(n_haplotypes, 2) else: # number of distinct pairs of haplotypes between # subpopulations n_pairs = (n_samples[i] * ploidy) * (n_samples[j] * ploidy) # fill upper and lower triangles normed_counts[i, j] = counts[i, j] / n_pairs normed_counts[j, i] = counts[j, i] / n_pairs return normed_counts
[docs]def plot_shared_doubletons(counts, subpop_labels=None, subpop_colors='bgrcmyk', axs=None, figsize_factor=1, ylim=None, relative=False, flip=False): """Plot counts of doubleton sharing between subpopulations as a bar chart. Parameters ---------- counts : array_like, ints A square matrix of shape (n_subpops, n_subpops) where the array element at index (i, j) holds the count of shared doubletons between the ith and jth subpopulations. subpop_labels : sequence of strings, optional Labels for the subpopulations. subpop_colors : sequence of colors, optional Colors for the subpopulations. axs : sequence of axes, optional The axes to use. If not provided, a new figure will be created. figsize_factor : float, optional Figure size in inches per subpopulation. Only used if `axs` is None. ylim : pair of ints or floats, optional Limits for the Y axes of all subplots. relative : bool, optional If True, normalise counts by dividing by the sum along each row. flip : bool, optional If True, invert the Y axis. Returns ------- axs : sequence of axes The axes on which the plot was drawn. See Also -------- count_shared_doubletons, plot_total_doubletons, plot_f2_fig """ # check inputs counts = np.asarray(counts) assert counts.ndim == 2 assert counts.shape[0] == counts.shape[1] n_subpops = counts.shape[0] # setup axes if axs is None: x = n_subpops * figsize_factor fig = plt.figure(figsize=(x, x)) axs = [fig.add_subplot(n_subpops, 1, i+1) for i in range(n_subpops)] # ensure we have enough colors colors = list(itertools.islice(itertools.cycle(subpop_colors), n_subpops)) # ensure we have subpopulation labels if subpop_labels is None: subpop_labels = range(n_subpops) # normalise by row if relative: # compute sum along each row row_sum = np.sum(counts, axis=1) # make sure we get the broadcasting right counts = counts / row_sum[:, np.newaxis] # determine global Y limits if ylim is None: ylim = (0, np.amax(counts)) # plot main bar for i, color in zip(range(n_subpops), colors): if flip: # plot from the top down ax = axs[i] else: # plot from the bottom up ax = axs[n_subpops-i-1] # select data to plot data = counts[i, :] # make a bar ax.bar(range(n_subpops), data, width=1, color=colors) # tidy up ax.set_ylim(*ylim) ax.set_ylabel(subpop_labels[i], rotation=0, ha='right', va='center', color=color) ax.tick_params(length=0) ax.set_yticks([]) if (flip and i > 0) or (not flip and i < n_subpops - 1): ax.set_xticks([]) else: ax.xaxis.tick_top() ax.set_xticks(np.arange(n_subpops) + .5) ax.set_xticklabels(subpop_labels, rotation=90) for s in 'top', 'left', 'right': ax.spines[s].set_visible(False) return axs
[docs]def plot_total_doubletons(counts, subpop_labels=None, width=.8, orientation='vertical', n_samples=None, ax=None, bar_kwargs=None): """Plot total counts of doubletons per subpopulations as a bar chart. Parameters ---------- counts : array_like, ints A square matrix of shape (n_subpops, n_subpops) where the array element at index (i, j) holds the count of shared doubletons between the ith and jth subpopulations. subpop_labels : sequence of strings, optional Labels for the subpopulations. width : float, optional The relative width of each bar. orientation : {'vertical', 'horizontal'} The bar orientation. n_samples : int or sequence of ints The number of samples in each sub-population. ax : axes, optional The axes on which to plot. If not provided, a new figure will be created. bar_kwargs : dict, optional Keyword arguments passed through to ax.bar(). Returns ------- ax : axes The axes on which the plot was drawn. See Also -------- count_shared_doubletons, plot_shared_doubletons, plot_total_doubletons, plot_f2_fig """ # check inputs counts = np.asarray(counts) assert counts.ndim == 2 assert counts.shape[0] == counts.shape[1] n_subpops = counts.shape[0] # setup axes if ax is None: fig, ax = plt.subplots() # sum rows y = np.sum(counts, axis=1) # normalise total counts by number of samples if n_samples is not None: # deal with polymorphic input if isinstance(n_samples, int): n_samples = [n_samples] * n_subpops n_samples = np.asarray(n_samples) y = y / n_samples # plot bar x = np.arange(n_subpops) + .5 if bar_kwargs is None: bar_kwargs = dict() bar_kwargs.setdefault('color', 'gray') bar_kwargs.setdefault('align', 'center') bar_kwargs.setdefault('linewidth', 0) if orientation == 'vertical': ax.bar(x, y, width, **bar_kwargs) else: ax.barh(x, y, width, **bar_kwargs) # ensure we have subpopulation labels if subpop_labels is None: subpop_labels = range(n_subpops) # tidy up ax.tick_params(length=0) if orientation == 'vertical': for s in 'top', 'right', 'left': ax.spines[s].set_visible(False) ax.set_xticks(range(n_subpops)) ax.set_xticklabels(subpop_labels, rotation=90) ax.set_yticks([]) ax.xaxis.tick_bottom() ax.set_ylabel('doubletons') ax.set_xlim(0, n_subpops) else: for s in 'top', 'right', 'bottom': ax.spines[s].set_visible(False) ax.set_yticks(range(n_subpops)) ax.set_yticklabels(subpop_labels, rotation=0) ax.set_xticks([]) ax.yaxis.tick_left() ax.set_xlabel('doubletons') ax.xaxis.set_label_position('top') ax.set_ylim(0, n_subpops) return ax
[docs]def plot_f2_fig(counts, subpop_labels=None, subpop_colors='bgrcmyk', fig=None, figsize_factor=1, relative=False, normed=False, n_samples=None, ploidy=2): """Plot a combined figure of shared doubleton counts and total counts per subpopulation. Parameters ---------- counts : array_like, ints A square matrix of shape (n_subpops, n_subpops) where the array element at index (i, j) holds the count of shared doubletons between the ith and jth subpopulations. subpop_labels : sequence of strings, optional Labels for the subpopulations. subpop_colors : sequence of colors, optional Colors for the subpopulations. fig : figure, optional The figure to use. If not provided, a new figure will be created. figsize_factor : float, optional Figure size in inches per subpopulation. Only used if `fig` is None. relative : bool, optional If True, plot counts relative to the sum along each row. normed : bool, optional If True, normalise counts by dividing by the number of possible pairs of haplotypes. n_samples : int or sequence of ints The number of samples in each sub-population. ploidy : int, optional The sample ploidy. (Only relevant if `normed` is True.) Returns ------- fig : figure The figure on which the plot was drawn. See Also -------- count_shared_doubletons, plot_shared_doubletons, plot_total_doubletons """ # check inputs counts = np.asarray(counts) assert counts.ndim == 2 assert counts.shape[0] == counts.shape[1] n_subpops = counts.shape[0] # setup figure if fig is None: width = (n_subpops + 1) * figsize_factor height = n_subpops * figsize_factor fig = plt.figure(figsize=(width, height)) # plot main bar main_axs = [plt.subplot2grid((n_subpops, n_subpops+1), (i, 0), rowspan=1, colspan=n_subpops) for i in range(n_subpops)] if normed: main_counts = normalise_doubleton_counts(counts, n_samples=n_samples, ploidy=ploidy) else: main_counts = counts plot_shared_doubletons(main_counts, subpop_labels=subpop_labels, subpop_colors=subpop_colors, relative=relative, axs=main_axs) # plot totals bar tot_ax = plt.subplot2grid((n_subpops, n_subpops+1), (0, n_subpops), rowspan=n_subpops, colspan=1) plot_total_doubletons(counts, subpop_labels=subpop_labels, n_samples=n_samples, orientation='horizontal', ax=tot_ax) tot_ax.set_yticks([]) return fig