Source code for anhima.gt

# -*- coding: utf-8 -*-
"""
Utility functions for working with genotype data.

See also the examples at:

- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/gt.ipynb

"""  # noqa


from __future__ import division, print_function, absolute_import


# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import numexpr as ne


# internal dependencies
import anhima.loc


[docs]def is_called(genotypes):
    """Find non-missing genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_called : ndarray, bool
        An array where elements are True if the genotype call is non-missing.

    See Also
    --------

    is_missing, is_hom_ref, is_het, is_hom_alt

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]  # noqa
        allele2 = genotypes[..., 1]  # noqa
        ex = '(allele1 >= 0) & (allele2 >= 0)'
        out = ne.evaluate(ex)

    # general ploidy case
    else:
        out = np.all(genotypes >= 0, axis=dim_ploidy)

    return out


[docs]def count_called(genotypes, axis=None):
    """Count non-missing genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of called (i.e., non-missing)
        genotypes. If `axis` is specified, returns the sum along the given
        `axis`.

    See Also
    --------
    is_called

    """

    # count genotypes
    n = np.sum(is_called(genotypes), axis=axis)

    return n


[docs]def is_missing(genotypes):
    """Find missing genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_missing: ndarray, bool
        An array where elements are True if the genotype call is missing.

    See Also
    --------

    is_called, is_hom_ref, is_het, is_hom_alt

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]  # noqa
        allele2 = genotypes[..., 1]  # noqa
        ex = '(allele1 < 0) | (allele2 < 0)'
        out = ne.evaluate(ex)

    # general ploidy case
    else:
        out = np.any(genotypes < 0, axis=dim_ploidy)

    return out


[docs]def count_missing(genotypes, axis=None):
    """Count non-missing genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of missing genotypes. If `axis`
        is specified, returns the sum along the given `axis`.

    See Also
    --------
    is_missing

    """

    # count genotypes
    n = np.sum(is_missing(genotypes), axis=axis)

    return n


[docs]def is_hom(genotypes):
    """Find homozygous genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_hom : ndarray, bool
        An array where elements are True if the genotype call is homozygous.

    See Also
    --------

    is_called, is_missing, is_hom_ref, is_hom_alt

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]  # noqa
        allele2 = genotypes[..., 1]  # noqa
        ex = '(allele1 >= 0) & (allele1 == allele2)'
        out = ne.evaluate(ex)

    # general ploidy case
    else:
        allele1 = genotypes[..., 0, np.newaxis]  # noqa
        other_alleles = genotypes[..., 1:]  # noqa
        ex = '(allele1 >= 0) & (allele1 == other_alleles)'
        out = np.all(ne.evaluate(ex), axis=dim_ploidy)

    return out


[docs]def count_hom(genotypes, axis=None):
    """Count homozygous genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of homozygous genotypes. If
        `axis` is specified, returns the sum along the given `axis`.

    See Also
    --------
    is_hom

    """

    # count genotypes
    n = np.sum(is_hom(genotypes), axis=axis)

    return n


[docs]def is_het(genotypes):
    """Find heterozygous genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_het : ndarray, bool
        An array where elements are True if the genotype call is heterozygous.

    See Also
    --------

    is_called, is_missing, is_hom_ref, is_hom_alt

    Notes
    -----

    Applicable to polyploid genotype calls, although note that all
    types of heterozygous genotype (i.e., anything not completely
    homozygous) will give an element value of True.

    Applicable to multiallelic variants, although note that the element value
    will be True in any case where the two alleles in a genotype are
    different, e.g., (0, 1), (0, 2), (1, 2), etc.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]
        allele2 = genotypes[..., 1]
        out = allele1 != allele2

    # general ploidy case
    else:
        allele1 = genotypes[..., 0, np.newaxis]
        other_alleles = genotypes[..., 1:]
        out = np.any(allele1 != other_alleles, axis=dim_ploidy)

    return out


[docs]def count_het(genotypes, axis=None):
    """Count heterozygous genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of heterozygous genotypes. If
        `axis` is specified, returns the sum along the given `axis`.

    See Also
    --------
    is_het

    """

    # count genotypes
    n = np.sum(is_het(genotypes), axis=axis)

    return n


[docs]def is_hom_ref(genotypes):
    """Find homozygous reference genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_hom_ref : ndarray, bool
        An array where elements are True if the genotype call is homozygous
        reference.

    See Also
    --------
    is_called, is_missing, is_het, is_hom_alt

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]  # noqa
        allele2 = genotypes[..., 1]  # noqa
        ex = '(allele1 == 0) & (allele2 == 0)'
        out = ne.evaluate(ex)

    # general ploidy case
    else:
        out = np.all(genotypes == 0, axis=dim_ploidy)

    return out


[docs]def count_hom_ref(genotypes, axis=None):
    """Count homozygous reference genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of homozygous
        reference genotypes. If `axis` is specified, returns the sum along
        the given `axis`.

    See Also
    --------
    is_hom_ref

    """

    # count genotypes
    n = np.sum(is_hom_ref(genotypes), axis=axis)

    return n


[docs]def is_hom_alt(genotypes):
    """Find homozygous non-reference genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    is_hom_alt : ndarray, bool
        An array where elements are True if the genotype call is homozygous
        non-reference.

    See Also
    --------

    is_called, is_missing, is_hom_ref, is_het

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array has 2 or more dimensions
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1
    ploidy = genotypes.shape[dim_ploidy]
    assert ploidy > 1

    # optimisation for diploid case
    if ploidy == 2:
        allele1 = genotypes[..., 0]  # noqa
        allele2 = genotypes[..., 1]  # noqa
        ex = '(allele1 > 0) & (allele1 == allele2)'
        out = ne.evaluate(ex)

    # general ploidy case
    else:
        allele1 = genotypes[..., 0, np.newaxis]  # noqa
        other_alleles = genotypes[..., 1:]  # noqa
        ex = '(allele1 > 0) & (allele1 == other_alleles)'
        out = np.all(ne.evaluate(ex), axis=dim_ploidy)

    return out


[docs]def count_hom_alt(genotypes, axis=None):
    """Count homozygous non-reference genotype calls.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the number of homozygous non-reference
        genotypes. If `axis` is specified, returns the sum along the given
        `axis`.

    See Also
    --------
    is_hom_alt

    """

    # count genotypes
    n = np.sum(is_hom_alt(genotypes), axis=axis)

    return n


[docs]def max_allele(genotypes, axis=None):
    """
    Return the highest allele index.

    Parameters
    ----------

    genotypes : array_like
        An array of shape (n_variants, n_samples, ploidy) where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    axis : int, optional
        The axis along which to determine the maximum (0 = variants,
        1 = samples). If not given, return the highest overall.

    Returns
    -------

    n : int
        The value of the highest allele index present in the genotypes array.

    """

    return np.amax(genotypes, axis=axis)


[docs]def as_haplotypes(genotypes):
    """Reshape an array of genotypes to view it as haplotypes by dropping the
    ploidy dimension.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    haplotypes : ndarray
        An array of shape (n_variants, n_samples * ploidy).

    Notes
    -----

    Note that if genotype calls are unphased, the haplotypes returned by this
    function will bear no resemblance to the true haplotypes.

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants.

    """

    # check input array
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim == 3

    # reshape, preserving size of first dimension
    newshape = (genotypes.shape[0], -1)
    haplotypes = np.reshape(genotypes, newshape)

    return haplotypes


[docs]def as_n_alt(genotypes):
    """Transform genotypes as the number of non-reference alleles.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    gn : ndarray, uint8
        An array where each genotype is coded as a single integer counting
        the number of alternate alleles.

    See Also
    --------

    as_012

    Notes
    -----

    Applicable to polyploid genotype calls.

    Applicable to multiallelic variants, although this function simply
    counts the number of non-reference alleles, it makes no distinction
    between different non-reference alleles.

    Note that this function returns 0 for missing genotype calls **and** for
    homozygous reference genotype calls, because in both cases the number of
    non-reference alleles is zero.

    """

    # check input array
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # assume ploidy is fastest changing dimension
    dim_ploidy = genotypes.ndim - 1

    # count number of alternate alleles
    gn = np.empty(genotypes.shape[:-1], dtype='u1')
    np.sum(genotypes > 0, axis=dim_ploidy, out=gn)

    return gn


[docs]def as_012(genotypes, fill=-1):
    """Transform genotypes recoding homozygous reference calls a
    0, heterozygous calls as 1, homozygous non-reference calls as 2, and
    missing calls as -1.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    fill : int, optional
        Default value for missing calls.

    Returns
    -------

    gn : ndarray, int8
        An array where each genotype is coded as a single integer as
        described above.

    See Also
    --------

    as_nalt

    Notes
    -----

    Applicable to polyploid genotype calls, although note that all
    types of heterozygous genotype (i.e., anything not completely
    homozygous) will be coded as 1.

    Applicable to multiallelic variants, although note the following.
    All heterozygous genotypes, e.g., (0, 1), (0, 2), (1, 2), ..., will be
    coded as 1. All homozygous non-reference genotypes, e.g., (1, 1), (2, 2),
    ..., will be coded as 2.

    """

    # check input array
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1

    # set up output array
    gn = np.empty(genotypes.shape[:-1], dtype='i1')
    gn.fill(fill)

    # determine genotypes
    gn[is_hom_ref(genotypes)] = 0
    gn[is_het(genotypes)] = 1
    gn[is_hom_alt(genotypes)] = 2

    return gn


[docs]def as_allele_counts(genotypes, alleles=None):
    """Transform genotypes into allele counts per call.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    alleles : sequence of ints, optional
        The alleles to count. If not specified, all alleles will be counted.

    Returns
    -------

    gac : ndarray, uint8
        An array where the ploidy dimension has been replaced by counts of
        each allele.

    """

    # check inputs
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1
    dim_ploidy = genotypes.ndim - 1
    if alleles is None:
        m = np.amax(genotypes)
        alleles = range(m+1)

    # count alleles along ploidy dimension
    gac = np.zeros(genotypes.shape[:-1] + (len(alleles),), dtype='u1')
    for i, allele in enumerate(alleles):
        np.sum(genotypes == allele, axis=dim_ploidy, out=gac[..., i])

    return gac


[docs]def pack_diploid(genotypes):
    """
    Pack diploid genotypes into a single byte for each genotype,
    using the left-most 4 bits for the first allele and the right-most 4 bits
    for the second allele. Allows single byte encoding of diploid genotypes
    for variants with up to 15 alleles.

    Parameters
    ----------

    genotypes : array_like, int
        An array of shape (n_variants, n_samples, ploidy) or
        (n_variants, ploidy) or (n_samples, ploidy), where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).

    Returns
    -------

    packed : ndarray, int8
        An array of genotypes where the ploidy dimension has been collapsed
        by bit packing the two alleles for each genotype into a single byte.

    See Also
    --------

    unpack_diploid_genotypes

    """

    # check inputs
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim > 1
    assert np.amax(genotypes) < 15, 'max allele is 14'
    assert np.amin(genotypes) > -2, 'min allele is -1'
    genotypes = genotypes.astype('u1')

    # add 1 to handle missing alleles coded as -1
    genotypes = genotypes + 1

    # left shift first allele by 4 bits
    a1 = np.left_shift(genotypes[..., 0], 4)

    # mask left-most 4 bits to ensure second allele doesn't clash with first
    # allele
    a2 = np.bitwise_and(genotypes[..., 1], 15)

    # pack them
    packed = np.bitwise_or(a1, a2)

    # rotate round so that hom ref calls are encoded as 0, better for sparse
    # matrices
    packed -= 17

    return packed


[docs]def unpack_diploid(packed):
    """
    Unpack an array of diploid genotypes that have been bit packed into
    single bytes.

    Parameters
    ----------

    packed : array_like
        An array of genotypes where the ploidy dimension has been collapsed
        by bit packing the two alleles for each genotype into a single byte.

    Returns
    -------

    genotypes : ndarray, int8
        An array of genotypes where the ploidy dimension has been restored by
        unpacking the input array.

    See Also
    --------

    pack_diploid_genotypes

    """

    # check inputs
    packed = np.asarray(packed).astype('u1')
    assert 1 <= packed.ndim <= 2

    # rotate back round so missing calls are encoded as 0
    packed = packed + 17

    # set up output array
    genotypes = np.empty(packed.shape + (2,), dtype='u1')
    a1 = genotypes[..., 0]
    a2 = genotypes[..., 1]

    # right shift 4 bits to extract first allele
    np.right_shift(packed, 4, out=a1)

    # mask left-most 4 bits to extract second allele
    np.bitwise_and(packed, 15, out=a2)

    # subtract 1 to restore coding of missing alleles as -1
    genotypes -= 1
    genotypes = genotypes.astype('i1')

    return genotypes


# packed representation of some common diploid genotypes
BHOM00 = 0
BHET01 = 1
BHOM11 = 17
BMISSING = 239


[docs]def count_genotypes(gn, t, axis=None):
    """Count genotypes of a given type.

    Parameters
    ----------

    gn : array_like, int
        An array of shape (n_variants, n_samples) or (n_variants,) or
        (n_samples,) where each element is a genotype called coded as a
        single integer.
    t : int
        The genotype to count.
    axis : int, optional
        The axis along which to count (0 = variants, 1 = samples).

    Returns
    -------

    n : int or array
        If `axis` is None, returns the total number of matching genotypes. If
        `axis` is specified, returns the sum along the given `axis`.

    """

    # normalise inputs
    gn = np.asarray(gn)

    # count genotypes
    n = np.sum(gn == t, axis=axis)

    return n


[docs]def windowed_genotype_counts(pos, gn, t, window_size, start_position=None,
                             stop_position=None):
    """Count genotype calls of a given type for a single sample in
    non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.

    Returns
    -------

    counts : ndarray, int
        Genotype counts for each window.
    bin_edges : ndarray, float
        The edges of the windows.

    See Also
    --------

    as_diploid_012, as_n_alt, windowed_genotype_density, windowed_genotype_rate

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 1

    # find matching genotypes
    values = gn == t

    # computed binned statistic
    counts, bin_edges = anhima.loc.windowed_statistic(
        pos, values=values, statistic='sum', window_size=window_size,
        start_position=start_position, stop_position=stop_position
    )

    return counts, bin_edges


[docs]def windowed_genotype_density(pos, gn, t, window_size, start_position=None,
                              stop_position=None):
    """Compute per-base-pair density of genotype calls of a given type for a
    single sample in non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.

    Returns
    -------

    density : ndarray, float
        Genotype density for each window.
    bin_edges : ndarray, float
        The edges of the windows.

    See Also
    --------

    as_diploid_012, as_n_alt, windowed_genotype_counts, windowed_genotype_rate

    """

    counts, bin_edges = windowed_genotype_counts(pos, gn, t,
                                                 window_size=window_size,
                                                 start_position=start_position,
                                                 stop_position=stop_position)
    density = counts / np.diff(bin_edges)
    return density, bin_edges


[docs]def windowed_genotype_rate(pos, gn, t, window_size, start_position=None,
                           stop_position=None):
    """Compute per-variant rate of genotype calls of a given type for a
    single sample in non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.

    Returns
    -------

    rate : ndarray, float
        Per-variant rate for each window.
    bin_edges : ndarray, float
        The edges of the windows.

    See Also
    --------

    as_diploid_012, as_n_alt, windowed_genotype_counts,
    windowed_genotype_density

    """

    variant_counts, _ = anhima.loc.windowed_variant_counts(
        pos, window_size, start_position=start_position,
        stop_position=stop_position
    )
    counts, bin_edges = windowed_genotype_counts(
        pos, gn, t, window_size=window_size, start_position=start_position,
        stop_position=stop_position
    )
    rate = counts / variant_counts
    return rate, bin_edges


[docs]def plot_windowed_genotype_counts(pos, gn, t, window_size, start_position=None,
                                  stop_position=None, ax=None,
                                  plot_kwargs=None):
    """Plots counts of genotype calls of a given type for a single sample in
    non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    plot_kwargs : dict-like
        Additional keyword arguments passed through to `plt.plot`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        fig = plt.figure(figsize=(x, x//3))
        ax = fig.add_subplot(111)

    # count genotypes
    y, bin_edges = windowed_genotype_counts(pos, gn, t, window_size,
                                            start_position=start_position,
                                            stop_position=stop_position)
    x = (bin_edges[1:] + bin_edges[:-1])/2

    # plot data
    if plot_kwargs is None:
        plot_kwargs = dict()
    plot_kwargs.setdefault('linestyle', '-')
    plot_kwargs.setdefault('marker', None)
    ax.plot(x, y, label=t, **plot_kwargs)

    # tidy up
    ax.set_ylim(bottom=0)
    ax.set_xlabel('position')
    ax.set_ylabel('counts')
    if start_position is None:
        start_position = np.min(pos)
    if stop_position is None:
        stop_position = np.max(pos)
    ax.set_xlim(start_position, stop_position)

    return ax


[docs]def plot_windowed_genotype_density(pos, gn, t, window_size,
                                   start_position=None,
                                   stop_position=None, ax=None,
                                   plot_kwargs=None):
    """Plots per-base-pair density of genotype calls of a given type for a
    single sample in non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    plot_kwargs : dict-like
        Additional keyword arguments passed through to `plt.plot`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        fig = plt.figure(figsize=(x, x//3))
        ax = fig.add_subplot(111)

    # count genotypes
    y, bin_edges = windowed_genotype_density(pos, gn, t, window_size,
                                             start_position=start_position,
                                             stop_position=stop_position)
    x = (bin_edges[1:] + bin_edges[:-1])/2

    # plot data
    if plot_kwargs is None:
        plot_kwargs = dict()
    plot_kwargs.setdefault('linestyle', '-')
    plot_kwargs.setdefault('marker', None)
    ax.plot(x, y, label=t, **plot_kwargs)

    # tidy up
    ax.set_ylim(bottom=0)
    ax.set_xlabel('position')
    ax.set_ylabel('density')
    if start_position is None:
        start_position = np.min(pos)
    if stop_position is None:
        stop_position = np.max(pos)
    ax.set_xlim(start_position, stop_position)

    return ax


[docs]def plot_windowed_genotype_rate(pos, gn, t, window_size,
                                start_position=None,
                                stop_position=None, ax=None,
                                plot_kwargs=None):
    """Plots per-variant rate of genotype calls of a given type for a
    single sample in non-overlapping windows over the genome.

    Parameters
    ----------

    pos : array_like, int
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    gn : array_like, int
        A 1-D array of genotypes for a single sample, where each genotype is
        coded as a single integer.
    t : int
        The genotype to count.
    window_size : int
        The size in base-pairs of the windows.
    start_position : int, optional
        The start position for the region over which to work.
    stop_position : int, optional
        The stop position for the region over which to work.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    plot_kwargs : dict-like
        Additional keyword arguments passed through to `plt.plot`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        fig = plt.figure(figsize=(x, x//3))
        ax = fig.add_subplot(111)

    # count genotypes
    y, bin_edges = windowed_genotype_rate(pos, gn, t, window_size,
                                          start_position=start_position,
                                          stop_position=stop_position)
    x = (bin_edges[1:] + bin_edges[:-1])/2

    # plot data
    if plot_kwargs is None:
        plot_kwargs = dict()
    plot_kwargs.setdefault('linestyle', '-')
    plot_kwargs.setdefault('marker', None)
    ax.plot(x, y, label=t, **plot_kwargs)

    # tidy up
    ax.set_ylim(bottom=0)
    ax.set_xlabel('position')
    ax.set_ylabel('per variant rate')
    if start_position is None:
        start_position = np.min(pos)
    if stop_position is None:
        stop_position = np.max(pos)
    ax.set_xlim(start_position, stop_position)

    return ax


[docs]def plot_discrete_calldata(a, labels=None, colors='wbgrcmyk', states=None,
                           ax=None, pcolormesh_kwargs=None):
    """
    Plot a color grid from discrete calldata (e.g., genotypes).

    Parameters
    ----------

    a : array_like, int, shape (n_variants, n_samples)
        2-dimensional array of integers containing the call data to plot.
    labels : sequence of strings, optional
        Axis labels (e.g., sample IDs).
    colors : sequence, optional
        Colors to use for different values of the array.
    states : sequence, optional
        Manually specify discrete calldata states (if not given will be
        determined from the data).
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    pcolormesh_kwargs : dict-like, optional
        Additional keyword arguments passed through to `plt.pcolormesh`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    a = np.asarray(a)
    assert a.ndim == 2

    # set up axes
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    # determine discrete states
    if states is None:
        states = np.unique(a)

    # determine colors for states
    colors = colors[:np.max(states)-np.min(states)+1]

    # plotting defaults
    if pcolormesh_kwargs is None:
        pcolormesh_kwargs = dict()
    pcolormesh_kwargs.setdefault('cmap', mpl.colors.ListedColormap(colors))
    pcolormesh_kwargs.setdefault(
        'norm', plt.Normalize(np.min(states), np.max(states)+1)
    )

    # plot the colormesh
    ax.pcolormesh(a.T, **pcolormesh_kwargs)

    # tidy up
    ax.set_xlim(0, a.shape[0])
    ax.set_ylim(0, a.shape[1])
    ax.set_xticks([])
    if labels is not None:
        ax.set_yticks(np.arange(a.shape[1]) + .5)
        ax.set_yticklabels(labels, rotation=0)
    else:
        ax.set_yticks([])

    return ax


[docs]def plot_continuous_calldata(a, labels=None, ax=None, pcolormesh_kwargs=None):
    """
    Plot a color grid from continuous calldata (e.g., DP).

    Parameters
    ----------

    a : array_like, shape (n_variants, n_samples)
        2-dimensional array of integers or floats containing the call data to
        plot.
    labels : sequence of strings, optional
        Axis labels (e.g., sample IDs).
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    pcolormesh_kwargs : dict-like, optional
        Additional keyword arguments passed through to `plt.pcolormesh`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    a = np.asarray(a)
    assert a.ndim == 2

    # set up axes
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    # plotting defaults
    if pcolormesh_kwargs is None:
        pcolormesh_kwargs = dict()
    pcolormesh_kwargs.setdefault('cmap', 'jet')

    # plot the color mesh
    ax.pcolormesh(a.T, **pcolormesh_kwargs)

    # tidy up
    ax.set_xlim(0, a.shape[0])
    ax.set_ylim(0, a.shape[1])
    ax.set_xticks([])
    if labels is not None:
        ax.set_yticks(np.arange(a.shape[1]) + .5)
        ax.set_yticklabels(labels, rotation=0)
    else:
        ax.set_yticks([])

    return ax


[docs]def plot_diploid_genotypes(gn,
                           labels=None,
                           colors='wbgr',
                           states=(-1, 0, 1, 2),
                           ax=None,
                           colormesh_kwargs=None):
    """Plot diploid genotypes as a color grid.

    Parameters
    ----------

    gn : array_like, int, shape (n_variants, n_samples)
        An array where each genotype is coded as a single integer as
        described above.
    labels : sequence of strings, optional
        Axis labels (e.g., sample IDs).
    colors : sequence, optional
        Colors to use for different values of the array.
    states : sequence, optional
        Manually specify discrete calldata states (if not given will be
        determined from the data).
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    colormesh_kwargs : dict-like
        Additional keyword arguments passed through to `plt.pcolormesh`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    return plot_discrete_calldata(gn, labels=labels, colors=colors,
                                  states=states, ax=ax,
                                  pcolormesh_kwargs=colormesh_kwargs)


[docs]def plot_genotype_counts_by_sample(gn, states=(-1, 0, 1, 2),
                                   colors='wbgr', labels=None,
                                   ax=None, width=1, orientation='vertical',
                                   bar_kwargs=None):
    """Plot a bar graph of genotype counts by sample.

    Parameters
    ----------

    gn : array_like, int, shape (n_variants, n_samples)
        An array where each genotype is coded as a single integer as
        described above.
    states : sequence, optional
        The genotype states to count.
    colors : sequence, optional
        Colors to use for corresponding states.
    labels : sequence of strings, optional
        Axis labels (e.g., sample IDs).
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    width : float, optional
        Width of the bars (will be used as height if `orientation` ==
        'horizontal').
    orientation : {'horizontal', 'vertical'}
        Which type of bar to plot.
    bar_kwargs : dict-like
        Additional keyword arguments passed through to `plt.bar`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 2
    n_variants = gn.shape[0]
    n_samples = gn.shape[1]

    # check orientation
    assert orientation in ('vertical', 'horizontal')

    # set up axes
    if ax is None:
        fig, ax = plt.subplots()

    # determine bar positions
    x = np.arange(n_samples)

    # plot bars for each type
    if bar_kwargs is None:
        bar_kwargs = dict()
    bar_kwargs.setdefault('linewidth', 0)
    yc = None
    for t, c in zip(states, colors):

        # count genotypes
        y = count_genotypes(gn, t, axis=0)

        # plot as bar
        if orientation == 'vertical':
            ax.bar(x, y, width=width, bottom=yc, color=c, label=t,
                   **bar_kwargs)
        else:
            ax.barh(x, y, height=width, left=yc, color=c, label=t,
                    **bar_kwargs)

        # keep cumulative count
        if yc is None:
            yc = y
        else:
            yc += y

    # tidy up
    # TODO code smells

    # set plot limits
    if orientation == 'vertical':
        ax.set_ylim(0, n_variants)
        ax.set_xlim(0, n_samples)
    else:
        ax.set_xlim(0, n_variants)
        ax.set_ylim(0, n_samples)

    # determine tick labels
    if labels:
        if orientation == 'vertical':
            ax.set_xticks(range(n_samples))
            ax.set_xticklabels(labels)
        else:
            ax.set_yticks(range(n_samples))
            ax.set_yticklabels(labels)
    else:
        if orientation == 'vertical':
            ax.set_xticks([])
        else:
            ax.set_yticks([])

    return ax


[docs]def plot_genotype_counts_by_variant(gn, states=(-1, 0, 1, 2),
                                    colors='wbgr', ax=None, width=1,
                                    orientation='vertical',
                                    bar_kwargs=None):
    """Plot a bar graph of genotype counts by variant.

    Parameters
    ----------

    gn : array_like, int, shape (n_variants, n_samples)
        An array where each genotype is coded as a single integer as
        described above.
    states : sequence, optional
        The genotype states to count.
    colors : sequence, optional
        Colors to use for corresponding states.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    width : float, optional
        Width of the bars (will be used as height if `orientation` ==
        'horizontal').
    orientation : {'horizontal', 'vertical'}
        Which type of bar to plot.
    bar_kwargs : dict-like
        Additional keyword arguments passed through to `plt.bar`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    gn = np.asarray(gn)
    assert gn.ndim == 2
    n_variants = gn.shape[0]
    n_samples = gn.shape[1]

    # check orientation
    assert orientation in ('vertical', 'horizontal')

    # set up axes
    if ax is None:
        fig, ax = plt.subplots()

    # determine bar positions
    x = np.arange(n_variants)

    # plot bars for each type
    if bar_kwargs is None:
        bar_kwargs = dict()
    bar_kwargs.setdefault('linewidth', 0)
    yc = None
    for t, c in zip(states, colors):

        # count genotypes
        y = count_genotypes(gn, t, axis=1)

        # plot as bar
        if orientation == 'vertical':
            ax.bar(x, y, width=width, bottom=yc, color=c, label=t,
                   **bar_kwargs)
        else:
            ax.barh(x, y, height=width, left=yc, color=c, label=t,
                    **bar_kwargs)

        # keep cumulative count
        if yc is None:
            yc = y
        else:
            yc += y

    # tidy up
    if orientation == 'vertical':
        ax.set_xticks([])
        ax.set_xlim(0, n_variants)
        ax.set_ylim(0, n_samples)
    else:
        ax.set_yticks([])
        ax.set_ylim(0, n_variants)
        ax.set_xlim(0, n_samples)

    return ax


[docs]def plot_continuous_calldata_by_sample(a, labels=None,
                                       ax=None,
                                       orientation='vertical',
                                       boxplot_kwargs=None):
    """Plot a boxplot of continuous call data (e.g., DP) by sample.

    Parameters
    ----------

    a : array_like, shape (n_variants, n_samples)
        2-dimensional array of integers or floats containing the call data to
        plot.
    labels : sequence of strings, optional
        Axis labels (e.g., sample IDs).
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    orientation : {'horizontal', 'vertical'}
        Which type of bar to plot.
    boxplot_kwargs : dict-like
        Additional keyword arguments passed through to `plt.boxplot`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn.

    """

    # check input array
    a = np.asarray(a)
    assert a.ndim == 2

    # check orientation
    assert orientation in ('vertical', 'horizontal')
    vert = orientation == 'vertical'

    # set up axes
    if ax is None:
        fig, ax = plt.subplots()

    # plot
    if boxplot_kwargs is None:
        boxplot_kwargs = dict()
    ax.boxplot(a, vert=vert, **boxplot_kwargs)

    # tidy up
    n_samples = a.shape[1]
    if labels:
        if orientation == 'vertical':
            ax.set_xticks(range(n_samples))
            ax.set_xticklabels(labels)
        else:
            ax.set_yticks(range(n_samples))
            ax.set_yticklabels(labels)
    else:
        if orientation == 'vertical':
            ax.set_xticks([])
        else:
            ax.set_yticks([])

    return ax