Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ and this project adheres to [Semantic Versioning][].
## [Unrealeased]

### Added
- Added `pycea.tl.expansion_test` for computing expansion p-values to detect clades under selection

- `pycea.tl.partition_test` to test for statistically significant differences between leaf partitions. (#40)
- `pycea.tl.expansion_test` for computing expansion p-values to detect expanding clades. (#38)

### Changed

- Replaced `tdata.obs_keys()` with `tdata.obs.keys()` to conform with anndata API changes. (#41)
- `pycea.tl.fitness` no longer returns a multi-indexed DataFrame when `tdata` contains a single tree. (#38)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
tl.n_extant
tl.fitness
tl.partition_test
tl.expansion_test
```

## Plotting
Expand Down
321 changes: 275 additions & 46 deletions docs/notebooks/growth-dynamics.ipynb

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions docs/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,22 @@ @article {Neher_2014
issn = {2050-084X},
publisher = {eLife Sciences Publications, Ltd},
}

@article{Griffiths_1998,
title = {The age of a mutation in a general coalescent tree},
volume = {14},
issn = {0882-0287},
url = {https://doi.org/10.1080/15326349808807471},
doi = {10.1080/15326349808807471},
abstract = {Kimura and Ohta showed that the expected age of a neutral mutation observed to be of frequency x in a population is We put this classical result in a general coalescent process context that allows questions to be asked about mutations in a sample, as well as in the population. In the general context the population size may vary back in time. Assuming an infmitely-many-sites model of mutation, we find the distribution of the number of mutant genes at a particular site in a sample; the probability that an allele at that site of a given frequency is ancestral; the distribution of the age of a mutation given its frequency in a sample, or population; and the distribution of the time to the most recent common ancestor, given the frequency of a mutation in a sample, or in the population},
number = {1-2},
urldate = {2025-11-13},
journal = {Communications in Statistics. Stochastic Models},
author = {Griffiths, R.C. and Tavaré, Simon},
month = jan,
year = {1998},
note = {Publisher: Taylor \& Francis
\_eprint: https://doi.org/10.1080/15326349808807471},
keywords = {Age of a mutation, Coalescent process, Population Genetics, Samples of DNA},
pages = {273--295},
}
1 change: 1 addition & 0 deletions src/pycea/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
from .neighbor_distance import neighbor_distance
from .partition_test import partition_test
from .sort import sort
from .topology import expansion_test
from .tree_distance import tree_distance
from .tree_neighbors import tree_neighbors
1 change: 0 additions & 1 deletion src/pycea/tl/ancestral_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,4 +353,3 @@ def ancestral_states(
_ancestral_states(t, key_added, method, costs, missing_state, default_state)
if copy:
return get_keyed_node_data(tdata, keys_added, tree_keys, slot="obst")
return get_keyed_node_data(tdata, keys_added, tree_keys)
1 change: 0 additions & 1 deletion src/pycea/tl/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,4 +185,3 @@ def clades(
tdata.obs[key_added] = tdata.obs.index.map(node_to_clade[key_added])
if copy:
return pd.concat(lcas)
return pd.concat(lcas)
6 changes: 4 additions & 2 deletions src/pycea/tl/fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,8 @@ def fitness(
leaf_fitness = get_keyed_leaf_data(tdata, key_added, tree_keys)
tdata.obs[key_added] = tdata.obs.index.map(leaf_fitness[key_added])
if copy:
return get_keyed_node_data(tdata, key_added, tree_keys)
return None
df = get_keyed_node_data(tdata, key_added, tree_keys)
if len(trees) == 1:
df.index = df.index.droplevel(0)
return df
return None
128 changes: 128 additions & 0 deletions src/pycea/tl/topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from collections.abc import Sequence
from typing import Literal, overload

import networkx as nx
import pandas as pd
import treedata as td
from scipy.special import comb as nCk

from pycea.utils import get_keyed_node_data, get_root, get_trees


@overload
def expansion_test(
tdata: td.TreeData,
tree: str | Sequence[str] | None = None,
min_clade_size: int = 10,
min_depth: int = 1,
key_added: str = "expansion_pvalue",
copy: Literal[True, False] = True,
) -> pd.DataFrame: ...
@overload
def expansion_test(
tdata: td.TreeData,
tree: str | Sequence[str] | None = None,
min_clade_size: int = 10,
min_depth: int = 1,
key_added: str = "expansion_pvalue",
copy: Literal[True, False] = False,
) -> None: ...
def expansion_test(
tdata: td.TreeData,
tree: str | Sequence[str] | None = None,
min_clade_size: int = 10,
min_depth: int = 1,
key_added: str = "expansion_pvalue",
copy: Literal[True, False] = False,
) -> pd.DataFrame | None:
"""Compute expansion p-values on a tree.

Uses the methodology described in :cite:`Yang_2022` to
assess the expansion probability of a given subclade of a phylogeny.
Mathematical treatment of the coalescent probability is described in :cite:`Griffiths_1998`.

The probability computed corresponds to the probability that, under a simple
neutral coalescent model, a given subclade contains the observed number of
cells; in other words, a one-sided p-value. Often, if the probability is
less than some threshold (e.g., 0.05), this might indicate that there exists
some subclade under this node to which this expansion probability can be
attributed (i.e. the null hypothesis that the subclade is undergoing
neutral drift can be rejected).

This function will add an attribute to tree nodes storing the expansion p-value.

On a typical balanced tree, this function performs in O(n) time.

Parameters
----------
tdata
TreeData object containing a phylogenetic tree.
min_clade_size
Minimum number of leaves in a subtree to be considered. Default is 10.
min_depth
Minimum depth of clade to be considered. Depth is measured in number
of nodes from the root, not branch lengths. Default is 1.
tree
The `obst` key or keys of the trees to use. If `None`, all trees are used.
key_added
Attribute key where expansion p-values will be stored in tree nodes.
Default is "expansion_pvalue".
copy
If True, return a copy of the TreeData with attributes added.
If False, modify in place and return None. Default is False.

Returns
-------
Returns `None` if ``copy=False``, otherwise returns a :class:`pandas.DataFrame` with expansion pvalues.

Sets the following fields:

* tdata.obst[tree].nodes[key_added] : `float`
- Expansion pvalue for each node.
"""
trees = get_trees(tdata, tree)

for _tree_key, t in trees.items():
root = get_root(t)
# instantiate attributes
leaf_counts = {}
for node in nx.dfs_postorder_nodes(t, root):
if t.out_degree(node) == 0:
leaf_counts[node] = 1
else:
leaf_counts[node] = sum(leaf_counts[child] for child in t.successors(node))

depths = {root: 0}
for u, v in nx.dfs_edges(t, root):
depths[v] = depths[u] + 1

nx.set_node_attributes(t, 1.0, key_added)

for node in t.nodes():
n = leaf_counts[node]
children = list(t.successors(node))
k = len(children)

if k == 0:
continue

for child in children:
b = leaf_counts[child]
depth = depths[child]

# Apply filters
if b < min_clade_size:
continue
if depth < min_depth:
continue

p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)
t.nodes[child][key_added] = float(p)

if copy:
df = get_keyed_node_data(tdata, keys=key_added, tree=tree, slot="obst")
if len(trees) == 1:
df.index = df.index.droplevel(0)
return df
else:
return None
4 changes: 2 additions & 2 deletions tests/test_fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ def test_fitness_lbi_copy(tdata):
out = fitness(tdata, method="lbi", copy=True, random_state=42)
assert isinstance(out, pd.DataFrame)
assert "fitness" in out.columns
assert out.loc[("tree", "B"), "fitness"] == pytest.approx(1.120, abs=1e-3)
assert out.loc["B", "fitness"] == pytest.approx(1.120, abs=1e-3)
tree = tdata.obst["tree"]
assert all("fitness" in tree.nodes[n] for n in tree.nodes)
assert set(out.droplevel(0).index) == set(tree.nodes)
assert set(out.index) == set(tree.nodes)


def test_bad_input(tdata):
Expand Down
146 changes: 146 additions & 0 deletions tests/test_topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import networkx as nx
import pandas as pd
import pytest
import treedata as td

import pycea as py
from pycea.tl.topology import expansion_test


@pytest.fixture
def test_tree():
"""Create a test TreeData object with a tree topology."""
# Create tree topology
tree = nx.DiGraph()
tree.add_edges_from(
[
("0", "1"),
("0", "2"),
("1", "3"),
("1", "4"),
("1", "5"),
("2", "6"),
("2", "7"),
("3", "8"),
("3", "9"),
("3", "16"),
("7", "10"),
("7", "11"),
("8", "12"),
("8", "13"),
("9", "14"),
("9", "15"),
("16", "17"),
("16", "18"),
]
)

# Create TreeData object
tdata = td.TreeData(
obs=pd.DataFrame(index=["4", "5", "6", "10", "11", "12", "13", "14", "15", "17", "18"]),
obst={"tree": tree},
)

return tdata


def test_expansion_test_min_clade(test_tree):
"""Test that min_clade_size=20 filters out all clades."""
expansion_test(test_tree, min_clade_size=20)
node_data = py.get.node_df(test_tree)
assert (node_data["expansion_pvalue"] == 1.0).all(), "All nodes should be filtered with min_clade_size=20"


def test_expansion_test_basic(test_tree):
"""Test expansion p-values with min_clade_size=2."""
result = expansion_test(test_tree, min_clade_size=2, copy=True)
expected_basic = {
"0": 1.0,
"1": 0.3,
"2": 0.8,
"3": 0.047,
"4": 1.0,
"5": 1.0,
"6": 1.0,
"7": 0.5,
"8": 0.6,
"9": 0.6,
"10": 1.0,
"11": 1.0,
"12": 1.0,
"13": 1.0,
"14": 1.0,
"15": 1.0,
"16": 0.6,
"17": 1.0,
"18": 1.0,
}
node_data = py.get.node_df(test_tree)
assert result.shape == (19, 1)
for node, expected in expected_basic.items():
actual = node_data.loc[node, "expansion_pvalue"]
assert abs(actual - expected) < 0.01, f"Basic: Node {node} expected {expected}, got {actual}"


def test_expansion_test_depth_filter(test_tree):
"""Test filtering with min_depth=3."""
expansion_test(test_tree, min_clade_size=2, min_depth=3)
expected_depth = {
"0": 1.0,
"1": 1.0,
"2": 1.0,
"3": 1.0,
"4": 1.0,
"5": 1.0,
"6": 1.0,
"7": 1.0,
"8": 0.6,
"9": 0.6,
"10": 1.0,
"11": 1.0,
"12": 1.0,
"13": 1.0,
"14": 1.0,
"15": 1.0,
"16": 0.6,
"17": 1.0,
"18": 1.0,
}
node_data = py.get.node_df(test_tree)
for node, expected in expected_depth.items():
actual = node_data.loc[node, "expansion_pvalue"]
assert abs(actual - expected) < 0.01, f"Depth filter: Node {node} expected {expected}, got {actual}"


def test_expansion_test_multiple_trees():
"""Test multiple trees."""
tree1 = nx.DiGraph()
tree1.add_edges_from([("0", "1"), ("0", "2")])
tree2 = nx.DiGraph()
tree2.add_edges_from([("A", "B"), ("A", "C")])
tdata_multi = td.TreeData(
obs=pd.DataFrame(index=["1", "2", "B", "C"]),
obst={"tree1": tree1, "tree2": tree2},
)
expansion_test(tdata_multi, min_clade_size=2)
assert "expansion_pvalue" in tdata_multi.obst["tree1"].nodes["0"]
assert "expansion_pvalue" in tdata_multi.obst["tree2"].nodes["A"]

tdata_multi2 = td.TreeData(
obs=pd.DataFrame(index=["1", "2", "3", "4", "B", "C"]),
obst={"tree1": tree1.copy(), "tree2": tree2.copy()},
)
expansion_test(tdata_multi2, min_clade_size=2, tree="tree1")
assert "expansion_pvalue" in tdata_multi2.obst["tree1"].nodes["0"]
assert "expansion_pvalue" not in tdata_multi2.obst["tree2"].nodes["A"]


def test_expansion_test_custom_key(test_tree):
"""Test using custom key_added parameter."""
expansion_test(test_tree, min_clade_size=2, key_added="custom_pvalue")
assert "custom_pvalue" in test_tree.obst["tree"].nodes["0"]
assert "expansion_pvalue" not in test_tree.obst["tree"].nodes["0"]


if __name__ == "__main__":
pytest.main(["-v", __file__])
Loading