Source code for anhima.tree

"""
This module provides some facilities for constructing and plotting trees. It
is mostly a wrapper around a very limited subset of functions from the R
`ape` package (Analyses of Phylogenetics and Evolution).

R must be installed, the `ape` R package must be installed, and the Python
package ``rpy2`` must be installed, e.g.::

    $ apt-get install r-base
    $ pip install rpy2
    $ R
    > install.packages("ape")

See also the examples at:

- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/tree.ipynb

"""  # noqa


from __future__ import division, print_function, unicode_literals, \
    absolute_import


# standard library dependencies
import tempfile


# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


_r_initialised = False
r = None
ro = None
grdevices = None
ape = None


def _init_r():
    """Private function to initialise R, only executed when needed.

    """

    global _r_initialised
    global r
    global ro
    global grdevices
    global ape

    if not _r_initialised:

        import rpy2.robjects as ro
        from rpy2.robjects import r
        from rpy2.robjects.numpy2ri import numpy2ri
        from rpy2.robjects.packages import importr
        ro.conversion.py2ri = numpy2ri
        grdevices = importr(b'grDevices')
        ape = importr(
            b'ape',
            robject_translations={
                'delta.plot': 'delta_dot_plot',
                'dist.dna': 'dist_dot_dna',
                'dist.nodes': 'dist_dot_nodes',
                'node.depth': 'node_dot_depth',
                'node.depth.edgelength': 'node_dot_depth_dot_edgelength',
                'node.height': 'node_dot_height',
                'node.height.clado': 'node_dot_height_dot_clado',
                'prop.part': 'prop_dot_part',
            }
        )

        # Define custom R functions to help with coloring tree edges by
        # population. These functions were written by Jacob Almagro-Garcia
        # <jg10@sanger.ac.uk> at the Wellcome Trust Sanger Institute.
        r("""
library(ape)


######################################################################################################################
#' Computes the number of leaves of each group that hang from each branch.
#' @param phylotree A tree of class phylo.
#' @param labelgroups A vector with the group of the tip labels (named with the labels).
#' @return A named matrix with the membership counts for each interior edge of the tree.
######################################################################################################################

computeEdgeGroupCounts <- function(phylotree, labelgroups) {

  labels <- phylotree$tip.label
  num_tips <- length(labels)
  edge_names <- unique(sort(c(phylotree$edge)))

  # This matrix will keep track of the group counts for each edge.
  edge_group_counts <- matrix(0, nrow=length(edge_names), ncol=length(unique(sort(labelgroups))))
  rownames(edge_group_counts) <- edge_names
  colnames(edge_group_counts) <- unique(labelgroups)

  # Init the leaf branches.
  sapply(1:num_tips, function(l) {
    edge_group_counts[as.character(l), as.character(labelgroups[phylotree$tip.label[l]])] <<- 1
  })

  # Sort edges by the value of the descendent
  # The first segment will contain the leaves whereas the second the branches (closer to leaves first).
  # We need to do this because leaves are numbered 1:num_tips and the branches CLOSER to the leaves
  # with higher numbers.
  edges <- phylotree$edge[order(phylotree$edge[,2]),]
  branches <- edges[num_tips:nrow(edges),]
  edges[num_tips:nrow(edges),] <- branches[order(branches[,1],decreasing=T),]
  invisible(apply(edges, 1, function(edge) {
    # Check if we are connecting a leaf.
    if(edge[2] <= num_tips) {
      e <- as.character(edge[1])
      g <- as.character(labelgroups[phylotree$tip.label[edge[2]]])
      edge_group_counts[e,g] <<- edge_group_counts[e,g] + 1
    }
    else {
      e1 <- as.character(edge[1])
      e2 <- as.character(edge[2])
      edge_group_counts[e1,] <<- edge_group_counts[e1,] + edge_group_counts[e2,]
    }
  }))
  return(edge_group_counts)
}


######################################################################################################################
#' Assigns the color of the majority group (hanging from) each branch.
#' @param phylotree A tree of class phylo.
#' @param edge_group_counts A named matrix with the group counts for each branch.
#' @param groupcolors A named vector with the color of each group.
#' @param equality_color The color to be used if there is no majority group.
#' @return A vector with the colors to be used with the tree branches.
######################################################################################################################

assignMajorityGroupColorToEdges <- function(phylotree, edge_group_counts, groupcolors, equality_color="gray") {
  edge_colors <- apply(phylotree$edge, 1, function(branch) {
    e <- as.character(branch[2])
    major_group_index <- which.max(edge_group_counts[e,])
    if(all(edge_group_counts[e,] == edge_group_counts[e,major_group_index]))
      return(equality_color)
    else
      return(groupcolors[colnames(edge_group_counts)[major_group_index]])
  })
  return(edge_colors)
}
""")  # noqa

        _r_initialised = True


[docs]def nj(dist_square, labels=None): """Wrapper for the `ape` ``nj`` function, which performs the neighbor-joining tree estimation of Saitou and Nei (1987). Parameters ---------- dist_square : array_like, shape (`n_samples`, `n_samples`) A pairwise distance matrix in square form. labels : sequence of strings, optional A sequence of strings to label the tips of the tree. Must be in the same order as rows of the distance matrix. Returns ------- An R object of class "phylo". See Also -------- anhima.dist.pairwise_distance """ # setup R _init_r() # normalise inputs dist_square = np.asarray(dist_square) assert dist_square.ndim == 2 assert dist_square.shape[0] == dist_square.shape[1] # convert distance matrix to R m = ro.vectors.Matrix(dist_square) # assign row and column labels if labels: # map all strings to str labels = [str(l) for l in labels] s = ro.StrVector(labels) m.rownames = s m.colnames = s # build the tree tree = ape.nj(m) return tree
[docs]def bionj(dist_square, labels=None): """Wrapper for the `ape` ``bionj`` function, which performs the BIONJ algorithm of Gascuel (1997). Parameters ---------- dist_square : array_like, shape (`n_samples`, `n_samples`) A pairwise distance matrix in square form. labels : sequence of strings, optional A sequence of strings to label the tips of the tree. Must be in the same order as rows of the distance matrix. Returns ------- An R object of class "phylo". See Also -------- anhima.dist.pairwise_distance """ # setup R _init_r() # normalise inputs dist_square = np.asarray(dist_square) assert dist_square.ndim == 2 assert dist_square.shape[0] == dist_square.shape[1] # convert distance matrix to R m = ro.vectors.Matrix(dist_square) # assign row and column labels if labels: # map all strings to str labels = [str(l) for l in labels] s = ro.StrVector(labels) m.rownames = s m.colnames = s # build the tree tree = ape.bionj(m) return tree
[docs]def plot_phylo(tree, plot_kwargs=None, add_scale_bar=None, filename=None, width=None, height=None, units=None, res=None, pointsize=None, bg=None, ax=None, imshow_kwargs=None): """Wrapper for the `ape` ``plot.phylo`` function, which plots phylogenetic trees. Plotting will use the R `png` graphics device. Parameters ---------- tree : R object of class "phylo" The tree to plot. plot_kwargs : dict-like, optional A dictionary of keyword arguments that will be passed through to the `ape` function ``plot.phylo()``. See the documentation for the `ape` package for a full list of supported arguments. add_scale_bar : dict-like, optional A dictionary of keyword arguments that will be passed through to the `ape` function ``add.scale.bar()``. See the documentation for the `ape` package for a full list of supported arguments. filename : string, optional File path for the generated PNG image. If None, a temporary file will be used. width : int or float, optional Width of the plot in `units`. height : int or float, optional Height of the plot in `units`. units : {'px', 'in', 'cm', 'mm'}, optional The units in which 'height' and 'width' are given. Can be 'px' (pixels, the default), 'in' (inches), 'cm' or 'mm'. res : int, optional The nominal resolution in ppi which will be recorded in the bitmap file, if a positive integer. Also used for 'units' other than the default, and to convert points to pixels. pointsize : float, optional The default pointsize of plotted text, interpreted as big points ( 1/72 inch) at 'res' ppi. bg : color, optional The background color. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. imshow_kwargs : dict-like Additional keyword arguments passed through to `imshow()`. Returns ------- ax : axes The axes on which the plot was drawn. """ # setup R _init_r() # setup image file if filename is None: tmp = tempfile.NamedTemporaryFile(suffix='.png') filename = tmp.name # initialise PNG device png_arg_names = 'width', 'height', 'units', 'res', 'pointsize', 'bg' png_args = dict() for n in png_arg_names: v = locals()[n] if v is not None: png_args[n] = v grdevices.png(filename, **png_args) # plot if plot_kwargs is None: plot_kwargs = dict() # adapt values for certain properties for k in 'tip.color', 'edge.color': if k in plot_kwargs: v = plot_kwargs[k] if isinstance(v, (list, tuple, np.ndarray)): plot_kwargs[k] = ro.StrVector(v) ape.plot_phylo(tree, **plot_kwargs) # add scale bar if add_scale_bar is not None: ape.add_scale_bar(**add_scale_bar) # finalise PNG device grdevices.dev_off() # read in PNG for matplotlib plotting png = mpimg.imread(filename) # set up axes for matplotlib plotting if ax is None: # try to make the figure exactly the right size for image native # resolution pxw, pxh = png.shape[:2] dpi = plt.rcParams['savefig.dpi'] w, h = pxw/dpi, pxh/dpi fig, ax = plt.subplots(figsize=(w, h)) # no margin fig.subplots_adjust(0, 0, 1, 1, hspace=0, wspace=0) if imshow_kwargs is None: imshow_kwargs = dict() imshow_kwargs.setdefault('aspect', 'equal') imshow_kwargs.setdefault('interpolation', 'none') ax.imshow(png, **imshow_kwargs) ax.set_axis_off() return ax
[docs]def write_tree(tree, filename=None, **kwargs): """ Wrapper for the `ape` ``write.tree`` function, which writes in a file a tree in parenthetic format using the Newick (also known as New Hampshire) format. Parameters ---------- tree : R object of class "phylo" The tree to be written. filename : string, optional The name of the file to write to. If ommitted, write the file to a string and return it. **kwargs : keyword arguments All further keyword arguments are passed through to ``write.tree``. Returns ------- result : string A string if `filename` is None, otherwise no return value. """ # setup R _init_r() # write the file if filename is None: kwargs['file'] = b'' else: kwargs['file'] = filename result = ape.write_tree(tree, **kwargs) # handle the case where tree is written to stdout if filename is None: return result[0]
[docs]def read_tree(filename, **kwargs): """ Wrapper for the `ape` ``read.tree`` function, which reads a file which contains one or several trees in parenthetic format known as the Newick or New Hampshire format. Parameters ---------- filename : string Name of the file to read. **kwargs : keyword arguments All further keyword arguments are passed through to ``read.tree``. Returns ------- tree : R object of class "phylo" If several trees are read in the file, the returned object is of class "multiPhylo", and is a list of objects of class "phylo". The name of each tree can be specified by tree.names, or can be read from the file (see details). """ # setup R _init_r() kwargs['file'] = filename return ape.read_tree(**kwargs)
[docs]def color_edges_by_group_majority(tree, labels, groups, colors, equality_color=b'gray'): """ Color the edges of a tree according to the majority group membership of the descendant tips. Parameters ---------- tree : R object of class "phylo" The tree containing the edges to be colored. labels : sequence of strings The tip labels. groups : sequence of strings A sequence of strings of the same length as `labels`, where each item is an identifier for the group to which the corresponding tip belongs. colors : dict-like A dictionary mapping groups to colors. equality_color : string, optional The color to use in the event of a tie. Returns ------- edge_colors : list of strings A list of colors for the edges of the tree, to be passed into :func:`plot_phylo`. """ # setup R _init_r() r_groups = ro.StrVector([str(g) for g in groups]) r_groups.names = ro.StrVector([str(l) for l in labels]) counts = r.computeEdgeGroupCounts(tree, r_groups) r_colors = ro.StrVector([str(v) for v in colors.values()]) r_colors.names = ro.StrVector([str(k) for k in colors.keys()]) edge_colors = r.assignMajorityGroupColorToEdges( tree, counts, groupcolors=r_colors, equality_color=equality_color ) return list(edge_colors)