# Source code for plastro.lineage_simulation

"""
Lineage simulation module for CRISPR-based lineage tracing.

This module provides functions for simulating lineage tracing data,
constructing phylogenetic trees from single-cell data, and introducing
CRISPR-based mutations to create realistic lineage relationships for 
plasticity analysis.
"""

import numpy as np
import pandas as pd
import os
from typing import Optional
from ete3 import Tree


def simulate_lineage_tracing(
    sim_ad: 'anndata.AnnData',
    terminal_ad: 'anndata.AnnData',
    latent_space_key: str = 'X_dc',
    number_of_cassettes: int = 100,
    save_to: Optional[str] = None
) -> 'cassiopeia.data.CassiopeiaTree':
    """
    Run the full lineage-tracing simulation on a single-cell dataset.

    This is a two-stage pipeline: a tree is first built by
    :func:`construct_tree` from the latent-space coordinates, and that tree
    is then handed to :func:`introduce_crispr_mutations`, which overlays
    simulated CRISPR edits on it.

    Parameters
    ----------
    sim_ad : anndata.AnnData
        Complete simulated single-cell dataset.
    terminal_ad : anndata.AnnData
        Subset containing only the terminal/observed cells.
    latent_space_key : str, optional
        Key in `sim_ad.obsm` holding the latent-space coordinates,
        by default 'X_dc'.
    number_of_cassettes : int, optional
        Value forwarded to :func:`introduce_crispr_mutations`,
        by default 100.
    save_to : str, optional
        Directory forwarded to :func:`construct_tree` for saving the tree,
        by default None (nothing is saved).

    Returns
    -------
    cassiopeia.data.CassiopeiaTree
        The object returned by :func:`introduce_crispr_mutations` for the
        constructed tree.

    Examples
    --------
    >>> import plastro
    >>> # sim_ad holds the full simulated data, terminal_ad the observed cells
    >>> cass_tree = plastro.simulate_lineage_tracing(sim_ad, terminal_ad, 'X_dc')
    >>> character_matrix = cass_tree.character_matrix
    >>> print(f"Character matrix shape: {character_matrix.shape}")

    Notes
    -----
    Any exception raised by either stage propagates to the caller unchanged.
    """
    # Stage 1 (inner call) builds the tree; stage 2 (outer call) adds mutations.
    return introduce_crispr_mutations(
        construct_tree(
            sim_ad,
            terminal_ad,
            latent_space_key=latent_space_key,
            save_to=save_to,
        ),
        number_of_cassettes=number_of_cassettes,
    )
def construct_tree(
    sim_ad: 'anndata.AnnData',
    terminal_ad: 'anndata.AnnData',
    latent_space_key: str = 'X_dc',
    save_to: Optional[str] = None
) -> Tree:
    """
    Construct phylogenetic tree from latent space coordinates.

    Computes pairwise Euclidean distances between the terminal cells (plus
    one randomly chosen root cell) in the latent space, builds a tree from
    that distance matrix with ``neighbor_joining`` (rooted on the chosen
    root cell), prunes it to the terminal cells and converts it to an
    ultrametric tree.

    Parameters
    ----------
    sim_ad : anndata.AnnData
        Complete simulated dataset with latent space coordinates.
    terminal_ad : anndata.AnnData
        Terminal/observed cells to include in the tree. Their names must be
        present in `sim_ad.obs_names`.
    latent_space_key : str, optional
        Key for latent space coordinates in `sim_ad.obsm`, by default 'X_dc'.
    save_to : str, optional
        Directory to save the tree file (``simulated_tree.nwk``) in; it is
        created if it does not exist. By default None (nothing is saved).

    Returns
    -------
    ete3.Tree
        Phylogenetic tree of the terminal cells.

    Raises
    ------
    KeyError
        If latent_space_key is not found in sim_ad.obsm.
    ValueError
        If `sim_ad` contains no cells, so no root cell can be chosen.

    Examples
    --------
    >>> tree = construct_tree(sim_ad, terminal_ad, latent_space_key='X_dc')
    >>> print(f"Tree has {len(tree.get_leaves())} leaves")
    >>> tree.show()  # Display tree visualization

    Notes
    -----
    The root cell is drawn with ``np.random.choice``, i.e. from NumPy's
    global random state; seed it (``np.random.seed``) for reproducible
    trees.
    """
    if latent_space_key not in sim_ad.obsm:
        raise KeyError(f"Latent space key '{latent_space_key}' not found in sim_ad.obsm")

    if len(sim_ad.obs_names) == 0:
        # Fail with a clear message instead of the opaque error that
        # np.random.choice raises on an empty sequence.
        raise ValueError("No suitable root cell found: sim_ad contains no cells")

    # Extract latent space coordinates, indexed by cell name
    dcs = pd.DataFrame(sim_ad.obsm[latent_space_key])
    dcs.index = sim_ad.obs_names

    # Select random root cell from branch 'b' if available, otherwise from
    # any branch. Only the column lookup is guarded so unrelated KeyErrors
    # are not swallowed.
    try:
        branch_names = sim_ad.obs['branch_name']
    except KeyError:
        # If branch_name column doesn't exist, select random cell
        root = np.random.choice(sim_ad.obs_names)
        print("Warning: 'branch_name' column not found, selecting random root")
    else:
        root_candidates = sim_ad.obs_names[branch_names == 'b']
        if len(root_candidates) == 0:
            # Fallback to any branch if 'b' doesn't exist
            root_candidates = sim_ad.obs_names
        root = np.random.choice(root_candidates)

    # Subset to terminal cells + root. dict.fromkeys removes duplicates while
    # keeping a deterministic order; a set would order the cells by string
    # hash, which varies between interpreter runs and would make the
    # distance-matrix layout (and possibly the tree) irreproducible.
    terminal_names = list(terminal_ad.obs_names)
    cells_to_include = list(dict.fromkeys([*terminal_names, root]))
    dcs = dcs.loc[cells_to_include, :]

    # Compute pairwise distance matrix
    from scipy.spatial.distance import pdist, squareform
    dists = pd.DataFrame(
        squareform(pdist(dcs, metric='euclidean')),
        index=dcs.index,
        columns=dcs.index,
    )

    # Import neighbor joining from phylo module
    from .phylo import neighbor_joining

    # Construct tree using neighbor joining
    tree = neighbor_joining(dists, outgroup=root)

    # Keep only terminal/observed cells
    tree.prune(terminal_names, preserve_branch_length=True)

    # Make tree ultrametric (equal distances from root to all leaves)
    tree.convert_to_ultrametric()

    if save_to is not None:
        if save_to:
            # An empty string means "current directory" for os.path.join and
            # must not be passed to makedirs.
            os.makedirs(save_to, exist_ok=True)
        tree_file = os.path.join(save_to, 'simulated_tree.nwk')
        tree.write(outfile=tree_file, format=1)
        print(f"Saved tree to {tree_file}")

    return tree
def introduce_crispr_mutations(tree: Tree, number_of_cassettes: int = 100) -> 'cassiopeia.data.CassiopeiaTree':
    """
    Simulate CRISPR mutations on a phylogenetic tree.

    Serialises the ete3 tree to Newick, wraps it in a Cassiopeia tree and
    overlays simulated Cas9 lineage-tracing data on it using Cassiopeia's
    ``Cas9LineageTracingDataSimulator``.

    Parameters
    ----------
    tree : ete3.Tree
        Phylogenetic tree to add mutations to. It is only read (written out
        as a Newick string); the mutations are stored on the returned
        Cassiopeia tree.
    number_of_cassettes : int, optional
        Number of cassettes passed to the simulator, by default 100.
        NOTE(review): all other simulator parameters (including the number
        of sites per cassette) are left at Cassiopeia's defaults, so the
        number of character columns may differ from this value - confirm
        against the installed Cassiopeia version.

    Returns
    -------
    cassiopeia.data.CassiopeiaTree
        Tree with the simulated data overlaid by the simulator.

    Raises
    ------
    ImportError
        If the Cassiopeia package is not installed. The original import
        failure is attached as ``__cause__``.

    Examples
    --------
    >>> from ete3 import Tree
    >>> tree = Tree("((A,B),C);")
    >>> cass_tree = introduce_crispr_mutations(tree)
    >>> print(cass_tree.character_matrix.shape)

    Notes
    -----
    This function requires the Cassiopeia package for mutation simulation;
    it is imported lazily so the rest of the module works without it.
    """
    try:
        import cassiopeia as cas
    except ImportError as err:
        # Chain the underlying failure so the real reason (missing package
        # vs. a broken transitive dependency) stays visible in the traceback.
        raise ImportError(
            "Cassiopeia package required for mutation simulation. "
            "Install with: pip install git+https://github.com/YosefLab/Cassiopeia@master#egg=cassiopeia-lineage"
        ) from err

    # Convert ete3 tree to Cassiopeia tree via its Newick representation
    newick_string = tree.write(format=1)
    cass_tree = cas.data.CassiopeiaTree(tree=newick_string)

    # Simulate mutations using Cassiopeia's mutation simulator
    # Parameters can be adjusted based on experimental setup
    mutation_simulator = cas.simulator.Cas9LineageTracingDataSimulator(
        number_of_cassettes=number_of_cassettes,
    )

    # Apply mutations to the tree (modifies cass_tree in place)
    mutation_simulator.overlay_data(cass_tree)

    return cass_tree