Standardize metadata on-the-fly#
This use cases runs on a LaminDB instance with populated CellType
and Pathway
registries. Make sure you run [CellTypist] and GO Ontology notebooks before executing this use case.
Here, we demonstrate how to standardize the metadata on-the-fly during cell type annotation and pathway enrichment analysis using these two registries.
For more information, see:
!lamin load use-cases-registries
import lamindb as ln
import bionty as bt
from lamin_usecases import datasets as ds
import scanpy as sc
import matplotlib.pyplot as plt
import celltypist
import gseapy as gp
sc.settings.set_figure_params(dpi=80, facecolor="white")
ln.track()
An interferon-beta treated dataset#
A small peripheral blood mononuclear cell dataset that is split into control and stimulated groups. The stimulated group was treated with interferon beta.
Let’s load the dataset and perform some preprocessing:
adata = ds.anndata_seurat_ifnb(preprocess=False, populate_registries=True)
adata
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
sc.pp.pca(adata, n_comps=20)
sc.pp.neighbors(adata, n_pcs=10)
sc.tl.umap(adata)
Analysis: cell type annotation using CellTypist#
model = celltypist.models.Model.load(model="Immune_All_Low.pkl")
predictions = celltypist.annotate(
adata, model="Immune_All_Low.pkl", majority_voting=True
)
adata.obs["cell_type_celltypist"] = predictions.predicted_labels.majority_voting
bt.CellType.inspect(adata.obs["cell_type_celltypist"]);
adata.obs["cell_type_celltypist"] = bt.CellType.standardize(
adata.obs["cell_type_celltypist"]
)
sc.pl.umap(
adata,
color=["cell_type_celltypist", "stim"],
frameon=False,
legend_fontsize=10,
wspace=0.4,
)
Analysis: Pathway enrichment analysis using Enrichr#
This analysis is based on the GSEApy scRNA-seq Example.
First, we compute differentially expressed genes using a Wilcoxon test between stimulated and control cells.
# compute differentially expressed genes
sc.tl.rank_genes_groups(
adata,
groupby="stim",
use_raw=False,
method="wilcoxon",
groups=["STIM"],
reference="CTRL",
)
rank_genes_groups_df = sc.get.rank_genes_groups_df(adata, "STIM")
rank_genes_groups_df.head()
Next, we filter out up/down-regulated differentially expressed gene sets:
degs_up = rank_genes_groups_df[
(rank_genes_groups_df["logfoldchanges"] > 0)
& (rank_genes_groups_df["pvals_adj"] < 0.05)
]
degs_dw = rank_genes_groups_df[
(rank_genes_groups_df["logfoldchanges"] < 0)
& (rank_genes_groups_df["pvals_adj"] < 0.05)
]
degs_up.shape, degs_dw.shape
Run pathway enrichment analysis on DEGs and plot top 10 pathways:
enr_up = gp.enrichr(degs_up.names, gene_sets="GO_Biological_Process_2023").res2d
gp.dotplot(enr_up, figsize=(2, 3), title="Up", cmap=plt.cm.autumn_r);
enr_dw = gp.enrichr(degs_dw.names, gene_sets="GO_Biological_Process_2023").res2d
gp.dotplot(enr_dw, figsize=(2, 3), title="Down", cmap=plt.cm.winter_r);
Register analyzed dataset and annotate with metadata#
Register new features and labels (check out more details here):
new_features = ln.Feature.from_df(adata.obs)
ln.save(new_features)
new_labels = [ln.ULabel(name=i) for i in adata.obs["stim"].unique()]
ln.save(new_labels)
features = ln.Feature.lookup()
Register dataset using a Artifact object:
artifact = ln.Artifact.from_anndata(
adata,
description="seurat_ifnb_activated_Bcells",
field=bt.Gene.symbol,
organism=( # optionally, globally set organism via bt.settings.organism = "human"
"human"
),
)
artifact.save()
Link cell type labels#
cell_type_records = bt.CellType.from_values(adata.obs["cell_type_celltypist"])
artifact.labels.add(cell_type_records, features.cell_type_celltypist)
Link stimulation labels:
stim_records = ln.ULabel.from_values(adata.obs["stim"])
artifact.labels.add(stim_records, features.stim)
Link pathway labels#
Let’s enable tracking of the current notebook as the transform of this artifact:
We further create two feature sets for degs_up
and degs_dw
which we can later associate with the associated pathways:
degs_up_featureset = ln.FeatureSet.from_values(
degs_up.names,
bt.Gene.symbol,
name="Up-regulated DEGs STIM vs CTRL",
type="category",
organism=( # optionally, globally set organism via bt.settings.organism = "human"
"human"
),
)
degs_dw_featureset = ln.FeatureSet.from_values(
degs_dw.names,
bt.Gene.symbol,
name="Down-regulated DEGs STIM vs CTRL",
type="category",
organism=( # optionally, globally set organism via bt.settings.organism = "human"
"human"
),
)
# Link feature sets to artifact
artifact.features.add_feature_set(degs_up_featureset, slot="STIM-up-DEGs")
artifact.features.add_feature_set(degs_dw_featureset, slot="STIM-down-DEGs")
Link the top 10 pathways to the corresponding differentially expressed genes:
def parse_ontology_id_from_enrichr_results(key):
"""Parse out the ontology id.
"ATF6-mediated Unfolded Protein Response (GO:0036500)" -> ("GO:0036500", "ATF6-mediated Unfolded Protein Response")
"""
id = key.split(" ")[-1].replace("(", "").replace(")", "")
name = key.replace(f" ({id})", "")
return (id, name)
# get ontology ids for the top 10 pathways
enr_up_top10 = [
pw_id[0]
for pw_id in enr_up.head(10).Term.apply(parse_ontology_id_from_enrichr_results)
]
enr_dw_top10 = [
pw_id[0]
for pw_id in enr_dw.head(10).Term.apply(parse_ontology_id_from_enrichr_results)
]
# get pathway records
enr_up_top10_pathways = bt.Pathway.from_values(enr_up_top10, bt.Pathway.ontology_id)
enr_dw_top10_pathways = bt.Pathway.from_values(enr_dw_top10, bt.Pathway.ontology_id)
Associate the pathways to the differentially expressed genes:
degs_up_featureset.pathways.set(enr_up_top10_pathways)
degs_dw_featureset.pathways.set(enr_dw_top10_pathways)
degs_up_featureset.pathways.list("name")
Querying metadata#
artifact.describe()
Querying cell types#
Querying for cell types contains “B cell” in the name:
bt.CellType.filter(name__contains="B cell").df().head()
Querying for all artifacts annotated with a cell type:
celltypes = bt.CellType.lookup()
celltypes.tem_trm_cytotoxic_t_cells
ln.Artifact.filter(cell_types=celltypes.tem_trm_cytotoxic_t_cells).df()
Querying pathways#
Querying for pathways contains “interferon-beta” in the name:
bt.Pathway.filter(name__contains="interferon-beta").df()
Query pathways from a gene:
bt.Pathway.filter(genes__symbol="KIR2DL1").df()
Query artifacts from a pathway:
ln.Artifact.filter(feature_sets__pathways__name__icontains="interferon-beta").first()
Query featuresets from a pathway to learn from which geneset this pathway was computed:
pathway = bt.Pathway.filter(ontology_id="GO:0035456").one()
pathway
degs = ln.FeatureSet.filter(pathways__ontology_id=pathway.ontology_id).one()
Now we can get the list of genes that are differentially expressed and belong to this pathway:
contributing_genes = pathway.genes.all() & degs.genes.all()
contributing_genes.list("symbol")
# clean up test instance
!lamin delete --force use-cases-registries
!rm -r ./use-cases-registries