Source code for anhima.gt

"""
Utility functions for working with genotype data.

See also the examples at:

- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/gt.ipynb

"""  # noqa


from __future__ import division, print_function, unicode_literals, \
    absolute_import


# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import numexpr as ne


# internal dependencies
import anhima.loc


[docs]def is_called(genotypes): """Find non-missing genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_called : ndarray, bool An array where elements are True if the genotype call is non-missing. See Also -------- is_missing, is_hom_ref, is_het, is_hom_alt Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] # noqa allele2 = genotypes[..., 1] # noqa ex = '(allele1 >= 0) & (allele2 >= 0)' out = ne.evaluate(ex) # general ploidy case else: out = np.all(genotypes >= 0, axis=dim_ploidy) return out
[docs]def count_called(genotypes, axis=None): """Count non-missing genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of called (i.e., non-missing) genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_called """ # count genotypes n = np.sum(is_called(genotypes), axis=axis) return n
[docs]def is_missing(genotypes): """Find missing genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_missing: ndarray, bool An array where elements are True if the genotype call is missing. See Also -------- is_called, is_hom_ref, is_het, is_hom_alt Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] # noqa allele2 = genotypes[..., 1] # noqa ex = '(allele1 < 0) | (allele2 < 0)' out = ne.evaluate(ex) # general ploidy case else: out = np.any(genotypes < 0, axis=dim_ploidy) return out
[docs]def count_missing(genotypes, axis=None): """Count non-missing genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of missing genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_missing """ # count genotypes n = np.sum(is_missing(genotypes), axis=axis) return n
[docs]def is_hom(genotypes): """Find homozygous genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_hom : ndarray, bool An array where elements are True if the genotype call is homozygous. See Also -------- is_called, is_missing, is_hom_ref, is_hom_alt Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] # noqa allele2 = genotypes[..., 1] # noqa ex = '(allele1 >= 0) & (allele1 == allele2)' out = ne.evaluate(ex) # general ploidy case else: allele1 = genotypes[..., 0, np.newaxis] # noqa other_alleles = genotypes[..., 1:] # noqa ex = '(allele1 >= 0) & (allele1 == other_alleles)' out = np.all(ne.evaluate(ex), axis=dim_ploidy) return out
[docs]def count_hom(genotypes, axis=None): """Count homozygous genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of homozygous genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_hom """ # count genotypes n = np.sum(is_hom(genotypes), axis=axis) return n
[docs]def is_het(genotypes): """Find heterozygous genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_het : ndarray, bool An array where elements are True if the genotype call is heterozygous. See Also -------- is_called, is_missing, is_hom_ref, is_hom_alt Notes ----- Applicable to polyploid genotype calls, although note that all types of heterozygous genotype (i.e., anything not completely homozygous) will give an element value of True. Applicable to multiallelic variants, although note that the element value will be True in any case where the two alleles in a genotype are different, e.g., (0, 1), (0, 2), (1, 2), etc. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] allele2 = genotypes[..., 1] out = allele1 != allele2 # general ploidy case else: allele1 = genotypes[..., 0, np.newaxis] other_alleles = genotypes[..., 1:] out = np.any(allele1 != other_alleles, axis=dim_ploidy) return out
[docs]def count_het(genotypes, axis=None): """Count heterozygous genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of heterozygous genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_het """ # count genotypes n = np.sum(is_het(genotypes), axis=axis) return n
[docs]def is_hom_ref(genotypes): """Find homozygous reference genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_hom_ref : ndarray, bool An array where elements are True if the genotype call is homozygous reference. See Also -------- is_called, is_missing, is_het, is_hom_alt Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] # noqa allele2 = genotypes[..., 1] # noqa ex = '(allele1 == 0) & (allele2 == 0)' out = ne.evaluate(ex) # general ploidy case else: out = np.all(genotypes == 0, axis=dim_ploidy) return out
[docs]def count_hom_ref(genotypes, axis=None): """Count homozygous reference genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of homozygous reference genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_hom_ref """ # count genotypes n = np.sum(is_hom_ref(genotypes), axis=axis) return n
[docs]def is_hom_alt(genotypes): """Find homozygous non-reference genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- is_hom_alt : ndarray, bool An array where elements are True if the genotype call is homozygous non-reference. See Also -------- is_called, is_missing, is_hom_ref, is_het Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array has 2 or more dimensions genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 ploidy = genotypes.shape[dim_ploidy] assert ploidy > 1 # optimisation for diploid case if ploidy == 2: allele1 = genotypes[..., 0] # noqa allele2 = genotypes[..., 1] # noqa ex = '(allele1 > 0) & (allele1 == allele2)' out = ne.evaluate(ex) # general ploidy case else: allele1 = genotypes[..., 0, np.newaxis] # noqa other_alleles = genotypes[..., 1:] # noqa ex = '(allele1 > 0) & (allele1 == other_alleles)' out = np.all(ne.evaluate(ex), axis=dim_ploidy) return out
[docs]def count_hom_alt(genotypes, axis=None): """Count homozygous non-reference genotype calls. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the number of homozygous non-reference genotypes. If `axis` is specified, returns the sum along the given `axis`. See Also -------- is_hom_alt """ # count genotypes n = np.sum(is_hom_alt(genotypes), axis=axis) return n
[docs]def max_allele(genotypes, axis=None): """ Return the highest allele index. Parameters ---------- genotypes : array_like An array of shape (n_variants, n_samples, ploidy) where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). axis : int, optional The axis along which to determine the maximum (0 = variants, 1 = samples). If not given, return the highest overall. Returns ------- n : int The value of the highest allele index present in the genotypes array. """ return np.amax(genotypes, axis=axis)
[docs]def as_haplotypes(genotypes): """Reshape an array of genotypes to view it as haplotypes by dropping the ploidy dimension. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- haplotypes : ndarray An array of shape (n_variants, n_samples * ploidy). Notes ----- Note that if genotype calls are unphased, the haplotypes returned by this function will bear no resemblance to the true haplotypes. Applicable to polyploid genotype calls. Applicable to multiallelic variants. """ # check input array genotypes = np.asarray(genotypes) assert genotypes.ndim == 3 # reshape, preserving size of first dimension newshape = (genotypes.shape[0], -1) haplotypes = np.reshape(genotypes, newshape) return haplotypes
[docs]def as_n_alt(genotypes): """Transform genotypes as the number of non-reference alleles. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- gn : ndarray, uint8 An array where each genotype is coded as a single integer counting the number of alternate alleles. See Also -------- as_012 Notes ----- Applicable to polyploid genotype calls. Applicable to multiallelic variants, although this function simply counts the number of non-reference alleles, it makes no distinction between different non-reference alleles. Note that this function returns 0 for missing genotype calls **and** for homozygous reference genotype calls, because in both cases the number of non-reference alleles is zero. """ # check input array genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # assume ploidy is fastest changing dimension dim_ploidy = genotypes.ndim - 1 # count number of alternate alleles gn = np.empty(genotypes.shape[:-1], dtype='u1') np.sum(genotypes > 0, axis=dim_ploidy, out=gn) return gn
[docs]def as_012(genotypes, fill=-1): """Transform genotypes recoding homozygous reference calls a 0, heterozygous calls as 1, homozygous non-reference calls as 2, and missing calls as -1. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). fill : int, optional Default value for missing calls. Returns ------- gn : ndarray, int8 An array where each genotype is coded as a single integer as described above. See Also -------- as_nalt Notes ----- Applicable to polyploid genotype calls, although note that all types of heterozygous genotype (i.e., anything not completely homozygous) will be coded as 1. Applicable to multiallelic variants, although note the following. All heterozygous genotypes, e.g., (0, 1), (0, 2), (1, 2), ..., will be coded as 1. All homozygous non-reference genotypes, e.g., (1, 1), (2, 2), ..., will be coded as 2. """ # check input array genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 # set up output array gn = np.empty(genotypes.shape[:-1], dtype='i1') gn.fill(fill) # determine genotypes gn[is_hom_ref(genotypes)] = 0 gn[is_het(genotypes)] = 1 gn[is_hom_alt(genotypes)] = 2 return gn
[docs]def as_allele_counts(genotypes, alleles=None): """Transform genotypes into allele counts per call. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). alleles : sequence of ints, optional The alleles to count. If not specified, all alleles will be counted. Returns ------- gac : ndarray, uint8 An array where the ploidy dimension has been replaced by counts of each allele. """ # check inputs genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 dim_ploidy = genotypes.ndim - 1 if alleles is None: m = np.amax(genotypes) alleles = range(m+1) # count alleles along ploidy dimension gac = np.zeros(genotypes.shape[:-1] + (len(alleles),), dtype='u1') for i, allele in enumerate(alleles): np.sum(genotypes == allele, axis=dim_ploidy, out=gac[..., i]) return gac
[docs]def pack_diploid(genotypes): """ Pack diploid genotypes into a single byte for each genotype, using the left-most 4 bits for the first allele and the right-most 4 bits for the second allele. Allows single byte encoding of diploid genotypes for variants with up to 15 alleles. Parameters ---------- genotypes : array_like, int An array of shape (n_variants, n_samples, ploidy) or (n_variants, ploidy) or (n_samples, ploidy), where each element of the array is an integer corresponding to an allele index (-1 = missing, 0 = reference allele, 1 = first alternate allele, 2 = second alternate allele, etc.). Returns ------- packed : ndarray, int8 An array of genotypes where the ploidy dimension has been collapsed by bit packing the two alleles for each genotype into a single byte. See Also -------- unpack_diploid_genotypes """ # check inputs genotypes = np.asarray(genotypes) assert genotypes.ndim > 1 assert np.amax(genotypes) < 15, 'max allele is 14' assert np.amin(genotypes) > -2, 'min allele is -1' genotypes = genotypes.astype('u1') # add 1 to handle missing alleles coded as -1 genotypes = genotypes + 1 # left shift first allele by 4 bits a1 = np.left_shift(genotypes[..., 0], 4) # mask left-most 4 bits to ensure second allele doesn't clash with first # allele a2 = np.bitwise_and(genotypes[..., 1], 15) # pack them packed = np.bitwise_or(a1, a2) # rotate round so that hom ref calls are encoded as 0, better for sparse # matrices packed -= 17 return packed
[docs]def unpack_diploid(packed): """ Unpack an array of diploid genotypes that have been bit packed into single bytes. Parameters ---------- packed : array_like An array of genotypes where the ploidy dimension has been collapsed by bit packing the two alleles for each genotype into a single byte. Returns ------- genotypes : ndarray, int8 An array of genotypes where the ploidy dimension has been restored by unpacking the input array. See Also -------- pack_diploid_genotypes """ # check inputs packed = np.asarray(packed).astype('u1') assert 1 <= packed.ndim <= 2 # rotate back round so missing calls are encoded as 0 packed = packed + 17 # set up output array genotypes = np.empty(packed.shape + (2,), dtype='u1') a1 = genotypes[..., 0] a2 = genotypes[..., 1] # right shift 4 bits to extract first allele np.right_shift(packed, 4, out=a1) # mask left-most 4 bits to extract second allele np.bitwise_and(packed, 15, out=a2) # subtract 1 to restore coding of missing alleles as -1 genotypes -= 1 genotypes = genotypes.astype('i1') return genotypes # packed representation of some common diploid genotypes
BHOM00 = 0 BHET01 = 1 BHOM11 = 17 BMISSING = 239
[docs]def count_genotypes(gn, t, axis=None): """Count genotypes of a given type. Parameters ---------- gn : array_like, int An array of shape (n_variants, n_samples) or (n_variants,) or (n_samples,) where each element is a genotype called coded as a single integer. t : int The genotype to count. axis : int, optional The axis along which to count (0 = variants, 1 = samples). Returns ------- n : int or array If `axis` is None, returns the total number of matching genotypes. If `axis` is specified, returns the sum along the given `axis`. """ # normalise inputs gn = np.asarray(gn) # count genotypes n = np.sum(gn == t, axis=axis) return n
[docs]def windowed_genotype_counts(pos, gn, t, window_size, start_position=None, stop_position=None): """Count genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. Returns ------- counts : ndarray, int Genotype counts for each window. bin_edges : ndarray, float The edges of the windows. See Also -------- as_diploid_012, as_n_alt, windowed_genotype_density, windowed_genotype_rate """ # check input array gn = np.asarray(gn) assert gn.ndim == 1 # find matching genotypes values = gn == t # computed binned statistic counts, bin_edges = anhima.loc.windowed_statistic( pos, values=values, statistic=b'sum', window_size=window_size, start_position=start_position, stop_position=stop_position ) return counts, bin_edges
[docs]def windowed_genotype_density(pos, gn, t, window_size, start_position=None, stop_position=None): """Compute per-base-pair density of genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. Returns ------- density : ndarray, float Genotype density for each window. bin_edges : ndarray, float The edges of the windows. See Also -------- as_diploid_012, as_n_alt, windowed_genotype_counts, windowed_genotype_rate """ counts, bin_edges = windowed_genotype_counts(pos, gn, t, window_size=window_size, start_position=start_position, stop_position=stop_position) density = counts / np.diff(bin_edges) return density, bin_edges
[docs]def windowed_genotype_rate(pos, gn, t, window_size, start_position=None, stop_position=None): """Compute per-variant rate of genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. Returns ------- rate : ndarray, float Per-variant rate for each window. bin_edges : ndarray, float The edges of the windows. See Also -------- as_diploid_012, as_n_alt, windowed_genotype_counts, windowed_genotype_density """ variant_counts, _ = anhima.loc.windowed_variant_counts( pos, window_size, start_position=start_position, stop_position=stop_position ) counts, bin_edges = windowed_genotype_counts( pos, gn, t, window_size=window_size, start_position=start_position, stop_position=stop_position ) rate = counts / variant_counts return rate, bin_edges
[docs]def plot_windowed_genotype_counts(pos, gn, t, window_size, start_position=None, stop_position=None, ax=None, plot_kwargs=None): """Plots counts of genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. plot_kwargs : dict-like Additional keyword arguments passed through to `plt.plot`. Returns ------- ax : axes The axes on which the plot was drawn. """ # set up axes if ax is None: x = plt.rcParams['figure.figsize'][0] fig = plt.figure(figsize=(x, x//3)) ax = fig.add_subplot(111) # count genotypes y, bin_edges = windowed_genotype_counts(pos, gn, t, window_size, start_position=start_position, stop_position=stop_position) x = (bin_edges[1:] + bin_edges[:-1])/2 # plot data if plot_kwargs is None: plot_kwargs = dict() plot_kwargs.setdefault('linestyle', '-') plot_kwargs.setdefault('marker', None) ax.plot(x, y, label=t, **plot_kwargs) # tidy up ax.set_ylim(bottom=0) ax.set_xlabel('position') ax.set_ylabel('counts') if start_position is None: start_position = np.min(pos) if stop_position is None: stop_position = np.max(pos) ax.set_xlim(start_position, stop_position) return ax
[docs]def plot_windowed_genotype_density(pos, gn, t, window_size, start_position=None, stop_position=None, ax=None, plot_kwargs=None): """Plots per-base-pair density of genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. plot_kwargs : dict-like Additional keyword arguments passed through to `plt.plot`. Returns ------- ax : axes The axes on which the plot was drawn. """ # set up axes if ax is None: x = plt.rcParams['figure.figsize'][0] fig = plt.figure(figsize=(x, x//3)) ax = fig.add_subplot(111) # count genotypes y, bin_edges = windowed_genotype_density(pos, gn, t, window_size, start_position=start_position, stop_position=stop_position) x = (bin_edges[1:] + bin_edges[:-1])/2 # plot data if plot_kwargs is None: plot_kwargs = dict() plot_kwargs.setdefault('linestyle', '-') plot_kwargs.setdefault('marker', None) ax.plot(x, y, label=t, **plot_kwargs) # tidy up ax.set_ylim(bottom=0) ax.set_xlabel('position') ax.set_ylabel('density') if start_position is None: start_position = np.min(pos) if stop_position is None: stop_position = np.max(pos) ax.set_xlim(start_position, stop_position) return ax
[docs]def plot_windowed_genotype_rate(pos, gn, t, window_size, start_position=None, stop_position=None, ax=None, plot_kwargs=None): """Plots per-variant rate of genotype calls of a given type for a single sample in non-overlapping windows over the genome. Parameters ---------- pos : array_like, int A sorted 1-dimensional array of genomic positions from a single chromosome/contig. gn : array_like, int A 1-D array of genotypes for a single sample, where each genotype is coded as a single integer. t : int The genotype to count. window_size : int The size in base-pairs of the windows. start_position : int, optional The start position for the region over which to work. stop_position : int, optional The stop position for the region over which to work. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. plot_kwargs : dict-like Additional keyword arguments passed through to `plt.plot`. Returns ------- ax : axes The axes on which the plot was drawn. """ # set up axes if ax is None: x = plt.rcParams['figure.figsize'][0] fig = plt.figure(figsize=(x, x//3)) ax = fig.add_subplot(111) # count genotypes y, bin_edges = windowed_genotype_rate(pos, gn, t, window_size, start_position=start_position, stop_position=stop_position) x = (bin_edges[1:] + bin_edges[:-1])/2 # plot data if plot_kwargs is None: plot_kwargs = dict() plot_kwargs.setdefault('linestyle', '-') plot_kwargs.setdefault('marker', None) ax.plot(x, y, label=t, **plot_kwargs) # tidy up ax.set_ylim(bottom=0) ax.set_xlabel('position') ax.set_ylabel('per variant rate') if start_position is None: start_position = np.min(pos) if stop_position is None: stop_position = np.max(pos) ax.set_xlim(start_position, stop_position) return ax
[docs]def plot_discrete_calldata(a, labels=None, colors='wbgrcmyk', states=None, ax=None, pcolormesh_kwargs=None): """ Plot a color grid from discrete calldata (e.g., genotypes). Parameters ---------- a : array_like, int, shape (n_variants, n_samples) 2-dimensional array of integers containing the call data to plot. labels : sequence of strings, optional Axis labels (e.g., sample IDs). colors : sequence, optional Colors to use for different values of the array. states : sequence, optional Manually specify discrete calldata states (if not given will be determined from the data). ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. pcolormesh_kwargs : dict-like, optional Additional keyword arguments passed through to `plt.pcolormesh`. Returns ------- ax : axes The axes on which the plot was drawn. """ # check input array a = np.asarray(a) assert a.ndim == 2 # set up axes if ax is None: fig = plt.figure() ax = fig.add_subplot(111) # determine discrete states if states is None: states = np.unique(a) # determine colors for states colors = colors[:np.max(states)-np.min(states)+1] # plotting defaults if pcolormesh_kwargs is None: pcolormesh_kwargs = dict() pcolormesh_kwargs.setdefault('cmap', mpl.colors.ListedColormap(colors)) pcolormesh_kwargs.setdefault( 'norm', plt.Normalize(np.min(states), np.max(states)+1) ) # plot the colormesh ax.pcolormesh(a.T, **pcolormesh_kwargs) # tidy up ax.set_xlim(0, a.shape[0]) ax.set_ylim(0, a.shape[1]) ax.set_xticks([]) if labels is not None: ax.set_yticks(np.arange(a.shape[1]) + .5) ax.set_yticklabels(labels, rotation=0) else: ax.set_yticks([]) return ax
[docs]def plot_continuous_calldata(a, labels=None, ax=None, pcolormesh_kwargs=None): """ Plot a color grid from continuous calldata (e.g., DP). Parameters ---------- a : array_like, shape (n_variants, n_samples) 2-dimensional array of integers or floats containing the call data to plot. labels : sequence of strings, optional Axis labels (e.g., sample IDs). ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. pcolormesh_kwargs : dict-like, optional Additional keyword arguments passed through to `plt.pcolormesh`. Returns ------- ax : axes The axes on which the plot was drawn. """ # check input array a = np.asarray(a) assert a.ndim == 2 # set up axes if ax is None: fig = plt.figure() ax = fig.add_subplot(111) # plotting defaults if pcolormesh_kwargs is None: pcolormesh_kwargs = dict() pcolormesh_kwargs.setdefault('cmap', 'jet') # plot the color mesh ax.pcolormesh(a.T, **pcolormesh_kwargs) # tidy up ax.set_xlim(0, a.shape[0]) ax.set_ylim(0, a.shape[1]) ax.set_xticks([]) if labels is not None: ax.set_yticks(np.arange(a.shape[1]) + .5) ax.set_yticklabels(labels, rotation=0) else: ax.set_yticks([]) return ax
[docs]def plot_diploid_genotypes(gn, labels=None, colors='wbgr', states=(-1, 0, 1, 2), ax=None, colormesh_kwargs=None): """Plot diploid genotypes as a color grid. Parameters ---------- gn : array_like, int, shape (n_variants, n_samples) An array where each genotype is coded as a single integer as described above. labels : sequence of strings, optional Axis labels (e.g., sample IDs). colors : sequence, optional Colors to use for different values of the array. states : sequence, optional Manually specify discrete calldata states (if not given will be determined from the data). ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. colormesh_kwargs : dict-like Additional keyword arguments passed through to `plt.pcolormesh`. Returns ------- ax : axes The axes on which the plot was drawn. """ return plot_discrete_calldata(gn, labels=labels, colors=colors, states=states, ax=ax, pcolormesh_kwargs=colormesh_kwargs)
[docs]def plot_genotype_counts_by_sample(gn, states=(-1, 0, 1, 2), colors='wbgr', labels=None, ax=None, width=1, orientation='vertical', bar_kwargs=None): """Plot a bar graph of genotype counts by sample. Parameters ---------- gn : array_like, int, shape (n_variants, n_samples) An array where each genotype is coded as a single integer as described above. states : sequence, optional The genotype states to count. colors : sequence, optional Colors to use for corresponding states. labels : sequence of strings, optional Axis labels (e.g., sample IDs). ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. width : float, optional Width of the bars (will be used as height if `orientation` == 'horizontal'). orientation : {'horizontal', 'vertical'} Which type of bar to plot. bar_kwargs : dict-like Additional keyword arguments passed through to `plt.bar`. Returns ------- ax : axes The axes on which the plot was drawn. """ # check input array gn = np.asarray(gn) assert gn.ndim == 2 n_variants = gn.shape[0] n_samples = gn.shape[1] # check orientation assert orientation in ('vertical', 'horizontal') # set up axes if ax is None: fig, ax = plt.subplots() # determine bar positions x = np.arange(n_samples) # plot bars for each type if bar_kwargs is None: bar_kwargs = dict() bar_kwargs.setdefault('linewidth', 0) yc = None for t, c in zip(states, colors): # count genotypes y = count_genotypes(gn, t, axis=0) # plot as bar if orientation == 'vertical': ax.bar(x, y, width=width, bottom=yc, color=c, label=t, **bar_kwargs) else: ax.barh(x, y, height=width, left=yc, color=c, label=t, **bar_kwargs) # keep cumulative count if yc is None: yc = y else: yc += y # tidy up # TODO code smells # set plot limits if orientation == 'vertical': ax.set_ylim(0, n_variants) ax.set_xlim(0, n_samples) else: ax.set_xlim(0, n_variants) ax.set_ylim(0, n_samples) # determine tick labels if labels: if orientation == 'vertical': ax.set_xticks(range(n_samples)) ax.set_xticklabels(labels) else: ax.set_yticks(range(n_samples)) ax.set_yticklabels(labels) else: if orientation == 'vertical': ax.set_xticks([]) else: ax.set_yticks([]) return ax
[docs]def plot_genotype_counts_by_variant(gn, states=(-1, 0, 1, 2), colors='wbgr', ax=None, width=1, orientation='vertical', bar_kwargs=None): """Plot a bar graph of genotype counts by variant. Parameters ---------- gn : array_like, int, shape (n_variants, n_samples) An array where each genotype is coded as a single integer as described above. states : sequence, optional The genotype states to count. colors : sequence, optional Colors to use for corresponding states. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. width : float, optional Width of the bars (will be used as height if `orientation` == 'horizontal'). orientation : {'horizontal', 'vertical'} Which type of bar to plot. bar_kwargs : dict-like Additional keyword arguments passed through to `plt.bar`. Returns ------- ax : axes The axes on which the plot was drawn. """ # check input array gn = np.asarray(gn) assert gn.ndim == 2 n_variants = gn.shape[0] n_samples = gn.shape[1] # check orientation assert orientation in ('vertical', 'horizontal') # set up axes if ax is None: fig, ax = plt.subplots() # determine bar positions x = np.arange(n_variants) # plot bars for each type if bar_kwargs is None: bar_kwargs = dict() bar_kwargs.setdefault('linewidth', 0) yc = None for t, c in zip(states, colors): # count genotypes y = count_genotypes(gn, t, axis=1) # plot as bar if orientation == 'vertical': ax.bar(x, y, width=width, bottom=yc, color=c, label=t, **bar_kwargs) else: ax.barh(x, y, height=width, left=yc, color=c, label=t, **bar_kwargs) # keep cumulative count if yc is None: yc = y else: yc += y # tidy up if orientation == 'vertical': ax.set_xticks([]) ax.set_xlim(0, n_variants) ax.set_ylim(0, n_samples) else: ax.set_yticks([]) ax.set_ylim(0, n_variants) ax.set_xlim(0, n_samples) return ax
[docs]def plot_continuous_calldata_by_sample(a, labels=None, ax=None, orientation='vertical', boxplot_kwargs=None): """Plot a boxplot of continuous call data (e.g., DP) by sample. Parameters ---------- a : array_like, shape (n_variants, n_samples) 2-dimensional array of integers or floats containing the call data to plot. labels : sequence of strings, optional Axis labels (e.g., sample IDs). ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. orientation : {'horizontal', 'vertical'} Which type of bar to plot. boxplot_kwargs : dict-like Additional keyword arguments passed through to `plt.boxplot`. Returns ------- ax : axes The axes on which the plot was drawn. """ # check input array a = np.asarray(a) assert a.ndim == 2 # check orientation assert orientation in ('vertical', 'horizontal') vert = orientation == 'vertical' # set up axes if ax is None: fig, ax = plt.subplots() # plot if boxplot_kwargs is None: boxplot_kwargs = dict() ax.boxplot(a, vert=vert, **boxplot_kwargs) # tidy up n_samples = a.shape[1] if labels: if orientation == 'vertical': ax.set_xticks(range(n_samples)) ax.set_xticklabels(labels) else: ax.set_yticks(range(n_samples)) ax.set_yticklabels(labels) else: if orientation == 'vertical': ax.set_xticks([]) else: ax.set_yticks([]) return ax