YosefLab · isabellechan089 · Nov 13, 2025 · Nov 5, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,12 +11,15 @@ and this project adheres to [Semantic Versioning][].
 ## [Unrealeased]
 
 ### Added
+- Added `pycea.tl.expansion_test` for computing expansion p-values to detect clades under selection
 
 - `pycea.tl.partition_test` to test for statistically significant differences between leaf partitions. (#40)
+- `pycea.tl.expansion_test` for computing expansion p-values to detect expanding clades. (#38)
 
 ### Changed
 
 - Replaced `tdata.obs_keys()` with `tdata.obs.keys()` to conform with anndata API changes. (#41)
+- `pycea.tl.fitness` no longer returns a multi-indexed DataFrame when `tdata` contains a single tree. (#38)
 
 ### Fixed
 

diff --git a/docs/api.md b/docs/api.md
@@ -33,6 +33,7 @@
     tl.n_extant
     tl.fitness
     tl.partition_test
+    tl.expansion_test
 ```
 
 ## Plotting

diff --git a/docs/notebooks/growth-dynamics.ipynb b/docs/notebooks/growth-dynamics.ipynb
diff --git a/docs/references.bib b/docs/references.bib
@@ -83,3 +83,22 @@ @article {Neher_2014
     issn = {2050-084X},
     publisher = {eLife Sciences Publications, Ltd},
 }
+
+@article{Griffiths_1998,
+	title = {The age of a mutation in a general coalescent tree},
+	volume = {14},
+	issn = {0882-0287},
+	url = {https://doi.org/10.1080/15326349808807471},
+	doi = {10.1080/15326349808807471},
+	abstract = {Kimura and Ohta showed that the expected age of a neutral mutation observed to be of frequency x in a population is We put this classical result in a general coalescent process context that allows questions to be asked about mutations in a sample, as well as in the population. In the general context the population size may vary back in time. Assuming an infmitely-many-sites model of mutation, we find the distribution of the number of mutant genes at a particular site in a sample; the probability that an allele at that site of a given frequency is ancestral; the distribution of the age of a mutation given its frequency in a sample, or population; and the distribution of the time to the most recent common ancestor, given the frequency of a mutation in a sample, or in the population},
+	number = {1-2},
+	urldate = {2025-11-13},
+	journal = {Communications in Statistics. Stochastic Models},
+	author = {Griffiths, R.C. and Tavaré, Simon},
+	month = jan,
+	year = {1998},
+	note = {Publisher: Taylor \& Francis
+\_eprint: https://doi.org/10.1080/15326349808807471},
+	keywords = {Age of a mutation, Coalescent process, Population Genetics, Samples of DNA},
+	pages = {273--295},
+}
diff --git a/src/pycea/tl/__init__.py b/src/pycea/tl/__init__.py
@@ -7,5 +7,6 @@
 from .neighbor_distance import neighbor_distance
 from .partition_test import partition_test
 from .sort import sort
+from .topology import expansion_test
 from .tree_distance import tree_distance
 from .tree_neighbors import tree_neighbors
diff --git a/src/pycea/tl/ancestral_states.py b/src/pycea/tl/ancestral_states.py
@@ -353,4 +353,3 @@ def ancestral_states(
                 _ancestral_states(t, key_added, method, costs, missing_state, default_state)
     if copy:
         return get_keyed_node_data(tdata, keys_added, tree_keys, slot="obst")
-        return get_keyed_node_data(tdata, keys_added, tree_keys)
diff --git a/src/pycea/tl/clades.py b/src/pycea/tl/clades.py
@@ -185,4 +185,3 @@ def clades(
     tdata.obs[key_added] = tdata.obs.index.map(node_to_clade[key_added])
     if copy:
         return pd.concat(lcas)
-        return pd.concat(lcas)
diff --git a/src/pycea/tl/fitness.py b/src/pycea/tl/fitness.py
@@ -531,6 +531,8 @@ def fitness(
     leaf_fitness = get_keyed_leaf_data(tdata, key_added, tree_keys)
     tdata.obs[key_added] = tdata.obs.index.map(leaf_fitness[key_added])
     if copy:
-        return get_keyed_node_data(tdata, key_added, tree_keys)
-    return None
+        df = get_keyed_node_data(tdata, key_added, tree_keys)
+        if len(trees) == 1:
+            df.index = df.index.droplevel(0)
+        return df
     return None
diff --git a/src/pycea/tl/topology.py b/src/pycea/tl/topology.py
@@ -0,0 +1,128 @@
+from collections.abc import Sequence
+from typing import Literal, overload
+
+import networkx as nx
+import pandas as pd
+import treedata as td
+from scipy.special import comb as nCk
+
+from pycea.utils import get_keyed_node_data, get_root, get_trees
+
+
+@overload
+def expansion_test(
+    tdata: td.TreeData,
+    tree: str | Sequence[str] | None = None,
+    min_clade_size: int = 10,
+    min_depth: int = 1,
+    key_added: str = "expansion_pvalue",
+    copy: Literal[True, False] = True,
+) -> pd.DataFrame: ...
+@overload
+def expansion_test(
+    tdata: td.TreeData,
+    tree: str | Sequence[str] | None = None,
+    min_clade_size: int = 10,
+    min_depth: int = 1,
+    key_added: str = "expansion_pvalue",
+    copy: Literal[True, False] = False,
+) -> None: ...
+def expansion_test(
+    tdata: td.TreeData,
+    tree: str | Sequence[str] | None = None,
+    min_clade_size: int = 10,
+    min_depth: int = 1,
+    key_added: str = "expansion_pvalue",
+    copy: Literal[True, False] = False,
+) -> pd.DataFrame | None:
+    """Compute expansion p-values on a tree.
+
+    Uses the methodology described in :cite:`Yang_2022` to
+    assess the expansion probability of a given subclade of a phylogeny.
+    Mathematical treatment of the coalescent probability is described in :cite:`Griffiths_1998`.
+
+    The probability computed corresponds to the probability that, under a simple
+    neutral coalescent model, a given subclade contains the observed number of
+    cells; in other words, a one-sided p-value. Often, if the probability is
+    less than some threshold (e.g., 0.05), this might indicate that there exists
+    some subclade under this node to which this expansion probability can be
+    attributed (i.e. the null hypothesis that the subclade is undergoing
+    neutral drift can be rejected).
+
+    This function will add an attribute to tree nodes storing the expansion p-value.
+
+    On a typical balanced tree, this function performs in O(n) time.
+
+    Parameters
+    ----------
+    tdata
+        TreeData object containing a phylogenetic tree.
+    min_clade_size
+        Minimum number of leaves in a subtree to be considered. Default is 10.
+    min_depth
+        Minimum depth of clade to be considered. Depth is measured in number
+        of nodes from the root, not branch lengths. Default is 1.
+    tree
+        The `obst` key or keys of the trees to use. If `None`, all trees are used.
+    key_added
+        Attribute key where expansion p-values will be stored in tree nodes.
+        Default is "expansion_pvalue".
+    copy
+        If True, return a copy of the TreeData with attributes added.
+        If False, modify in place and return None. Default is False.
+
+    Returns
+    -------
+    Returns `None` if ``copy=False``, otherwise returns a :class:`pandas.DataFrame` with expansion pvalues.
+
+    Sets the following fields:
+
+    * tdata.obst[tree].nodes[key_added] : `float`
+        - Expansion pvalue for each node.
+    """
+    trees = get_trees(tdata, tree)
+
+    for _tree_key, t in trees.items():
+        root = get_root(t)
+        # instantiate attributes
+        leaf_counts = {}
+        for node in nx.dfs_postorder_nodes(t, root):
+            if t.out_degree(node) == 0:
+                leaf_counts[node] = 1
+            else:
+                leaf_counts[node] = sum(leaf_counts[child] for child in t.successors(node))
+
+        depths = {root: 0}
+        for u, v in nx.dfs_edges(t, root):
+            depths[v] = depths[u] + 1
+
+        nx.set_node_attributes(t, 1.0, key_added)
+
+        for node in t.nodes():
+            n = leaf_counts[node]
+            children = list(t.successors(node))
+            k = len(children)
+
+            if k == 0:
+                continue
+
+            for child in children:
+                b = leaf_counts[child]
+                depth = depths[child]
+
+                # Apply filters
+                if b < min_clade_size:
+                    continue
+                if depth < min_depth:
+                    continue
+
+                p = nCk(n - b, k - 1) / nCk(n - 1, k - 1)
+                t.nodes[child][key_added] = float(p)
+
+    if copy:
+        df = get_keyed_node_data(tdata, keys=key_added, tree=tree, slot="obst")
+        if len(trees) == 1:
+            df.index = df.index.droplevel(0)
+        return df
+    else:
+        return None
diff --git a/tests/test_fitness.py b/tests/test_fitness.py
@@ -27,10 +27,10 @@ def test_fitness_lbi_copy(tdata):
     out = fitness(tdata, method="lbi", copy=True, random_state=42)
     assert isinstance(out, pd.DataFrame)
     assert "fitness" in out.columns
-    assert out.loc[("tree", "B"), "fitness"] == pytest.approx(1.120, abs=1e-3)
+    assert out.loc["B", "fitness"] == pytest.approx(1.120, abs=1e-3)
     tree = tdata.obst["tree"]
     assert all("fitness" in tree.nodes[n] for n in tree.nodes)
-    assert set(out.droplevel(0).index) == set(tree.nodes)
+    assert set(out.index) == set(tree.nodes)
 
 
 def test_bad_input(tdata):

diff --git a/tests/test_topology.py b/tests/test_topology.py
@@ -0,0 +1,146 @@
+import networkx as nx
+import pandas as pd
+import pytest
+import treedata as td
+
+import pycea as py
+from pycea.tl.topology import expansion_test
+
+
+@pytest.fixture
+def test_tree():
+    """Create a test TreeData object with a tree topology."""
+    # Create tree topology
+    tree = nx.DiGraph()
+    tree.add_edges_from(
+        [
+            ("0", "1"),
+            ("0", "2"),
+            ("1", "3"),
+            ("1", "4"),
+            ("1", "5"),
+            ("2", "6"),
+            ("2", "7"),
+            ("3", "8"),
+            ("3", "9"),
+            ("3", "16"),
+            ("7", "10"),
+            ("7", "11"),
+            ("8", "12"),
+            ("8", "13"),
+            ("9", "14"),
+            ("9", "15"),
+            ("16", "17"),
+            ("16", "18"),
+        ]
+    )
+
+    # Create TreeData object
+    tdata = td.TreeData(
+        obs=pd.DataFrame(index=["4", "5", "6", "10", "11", "12", "13", "14", "15", "17", "18"]),
+        obst={"tree": tree},
+    )
+
+    return tdata
+
+
+def test_expansion_test_min_clade(test_tree):
+    """Test that min_clade_size=20 filters out all clades."""
+    expansion_test(test_tree, min_clade_size=20)
+    node_data = py.get.node_df(test_tree)
+    assert (node_data["expansion_pvalue"] == 1.0).all(), "All nodes should be filtered with min_clade_size=20"
+
+
+def test_expansion_test_basic(test_tree):
+    """Test expansion p-values with min_clade_size=2."""
+    result = expansion_test(test_tree, min_clade_size=2, copy=True)
+    expected_basic = {
+        "0": 1.0,
+        "1": 0.3,
+        "2": 0.8,
+        "3": 0.047,
+        "4": 1.0,
+        "5": 1.0,
+        "6": 1.0,
+        "7": 0.5,
+        "8": 0.6,
+        "9": 0.6,
+        "10": 1.0,
+        "11": 1.0,
+        "12": 1.0,
+        "13": 1.0,
+        "14": 1.0,
+        "15": 1.0,
+        "16": 0.6,
+        "17": 1.0,
+        "18": 1.0,
+    }
+    node_data = py.get.node_df(test_tree)
+    assert result.shape == (19, 1)
+    for node, expected in expected_basic.items():
+        actual = node_data.loc[node, "expansion_pvalue"]
+        assert abs(actual - expected) < 0.01, f"Basic: Node {node} expected {expected}, got {actual}"
+
+
+def test_expansion_test_depth_filter(test_tree):
+    """Test filtering with min_depth=3."""
+    expansion_test(test_tree, min_clade_size=2, min_depth=3)
+    expected_depth = {
+        "0": 1.0,
+        "1": 1.0,
+        "2": 1.0,
+        "3": 1.0,
+        "4": 1.0,
+        "5": 1.0,
+        "6": 1.0,
+        "7": 1.0,
+        "8": 0.6,
+        "9": 0.6,
+        "10": 1.0,
+        "11": 1.0,
+        "12": 1.0,
+        "13": 1.0,
+        "14": 1.0,
+        "15": 1.0,
+        "16": 0.6,
+        "17": 1.0,
+        "18": 1.0,
+    }
+    node_data = py.get.node_df(test_tree)
+    for node, expected in expected_depth.items():
+        actual = node_data.loc[node, "expansion_pvalue"]
+        assert abs(actual - expected) < 0.01, f"Depth filter: Node {node} expected {expected}, got {actual}"
+
+
+def test_expansion_test_multiple_trees():
+    """Test multiple trees."""
+    tree1 = nx.DiGraph()
+    tree1.add_edges_from([("0", "1"), ("0", "2")])
+    tree2 = nx.DiGraph()
+    tree2.add_edges_from([("A", "B"), ("A", "C")])
+    tdata_multi = td.TreeData(
+        obs=pd.DataFrame(index=["1", "2", "B", "C"]),
+        obst={"tree1": tree1, "tree2": tree2},
+    )
+    expansion_test(tdata_multi, min_clade_size=2)
+    assert "expansion_pvalue" in tdata_multi.obst["tree1"].nodes["0"]
+    assert "expansion_pvalue" in tdata_multi.obst["tree2"].nodes["A"]
+
+    tdata_multi2 = td.TreeData(
+        obs=pd.DataFrame(index=["1", "2", "3", "4", "B", "C"]),
+        obst={"tree1": tree1.copy(), "tree2": tree2.copy()},
+    )
+    expansion_test(tdata_multi2, min_clade_size=2, tree="tree1")
+    assert "expansion_pvalue" in tdata_multi2.obst["tree1"].nodes["0"]
+    assert "expansion_pvalue" not in tdata_multi2.obst["tree2"].nodes["A"]
+
+
+def test_expansion_test_custom_key(test_tree):
+    """Test using custom key_added parameter."""
+    expansion_test(test_tree, min_clade_size=2, key_added="custom_pvalue")
+    assert "custom_pvalue" in test_tree.obst["tree"].nodes["0"]
+    assert "expansion_pvalue" not in test_tree.obst["tree"].nodes["0"]
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])