Source code for celltypist.annotate

from . import classifier
from .models import Model
from typing import Optional, Union
import numpy as np
import pandas as pd
from anndata import AnnData
from . import logger

def annotate(filename: Union[AnnData,str] = "",
             model: Optional[Union[str, Model]] = None,
             transpose_input: bool = False,
             gene_file: Optional[str] = None,
             cell_file: Optional[str] = None,
             mode: str = 'best match',
             p_thres: float = 0.5,
             majority_voting: bool = False,
             over_clustering: Optional[Union[str, list, tuple, np.ndarray, pd.Series, pd.Index]] = None,
             use_GPU: bool = False,
             min_prop: float = 0) -> classifier.AnnotationResult:
    """
    Run the prediction and (optional) majority voting to annotate the input dataset.

    Parameters
    ----------
    filename
        Path to the input count matrix (supported types are csv, txt, tsv, tab and mtx) or AnnData (h5ad).
        If it's the former, a cell-by-gene format is desirable (see `transpose_input` for more information).
        Also accepts the input as an :class:`~anndata.AnnData` object already loaded in memory.
        Genes should be gene symbols. Non-expressed genes are preferred to be provided as well.
    model
        Model used to predict the input cells. Default to using the `'Immune_All_Low.pkl'` model.
        Can be a :class:`~celltypist.models.Model` object that wraps the logistic Classifier and the StandardScaler,
        the path to the desired model file, or the model name.
        To see all available models and their descriptions, use :func:`~celltypist.models.models_description`.
    transpose_input
        Whether to transpose the input matrix. Set to `True` if `filename` is provided in a gene-by-cell format.
        (Default: `False`)
    gene_file
        Path to the file which stores each gene per line corresponding to the genes used in the provided mtx file.
        Ignored if `filename` is not provided in the mtx format.
    cell_file
        Path to the file which stores each cell per line corresponding to the cells used in the provided mtx file.
        Ignored if `filename` is not provided in the mtx format.
    mode
        The way cell prediction is performed.
        For each query cell, the default (`'best match'`) is to choose the cell type with the largest
        score/probability as the final prediction.
        Setting to `'prob match'` will enable a multi-label classification, which assigns 0 (i.e., unassigned),
        1, or >=2 cell type labels to each query cell.
        (Default: `'best match'`)
    p_thres
        Probability threshold for the multi-label classification. Ignored if `mode` is `'best match'`.
        (Default: 0.5)
    majority_voting
        Whether to refine the predicted labels by running the majority voting classifier after over-clustering.
        (Default: `False`)
    over_clustering
        This argument can be provided in several ways:
        1) an input plain file with the over-clustering result of one cell per line.
        2) a string key specifying an existing metadata column in the AnnData (pre-created by the user).
        3) a python list, tuple, numpy array, pandas series or index representing the over-clustering result of the input cells.
        4) if none of the above is provided, will use a heuristic over-clustering approach according to the size of input data.
        Ignored if `majority_voting` is set to `False`.
    use_GPU
        Whether to use GPU for over clustering on the basis of `rapids-singlecell`.
        This argument is only relevant when `majority_voting = True`.
        (Default: `False`)
    min_prop
        For the dominant cell type within a subcluster, the minimum proportion of cells required to support
        naming of the subcluster by this cell type.
        Ignored if `majority_voting` is set to `False`.
        Subcluster that fails to pass this proportion threshold will be assigned `'Heterogeneous'`.
        (Default: 0)

    Returns
    -------
    :class:`~celltypist.classifier.AnnotationResult`
        An :class:`~celltypist.classifier.AnnotationResult` object. Four important attributes within this class are:
        1) :attr:`~celltypist.classifier.AnnotationResult.predicted_labels`, predicted labels from celltypist.
        2) :attr:`~celltypist.classifier.AnnotationResult.decision_matrix`, decision matrix from celltypist.
        3) :attr:`~celltypist.classifier.AnnotationResult.probability_matrix`, probability matrix from celltypist.
        4) :attr:`~celltypist.classifier.AnnotationResult.adata`, AnnData representation of the input data.

    Raises
    ------
    Exception
        If `over_clustering` is a string that is not a cell metadata column and reading it as a
        plain text file fails. The underlying error is attached as the exception's cause.
    ValueError
        If the length of the over-clustering result differs from the number of input cells.
    """
    #load model (anything that is not already a `Model` instance is handed to `Model.load`)
    lr_classifier = model if isinstance(model, Model) else Model.load(model)
    #construct Classifier class
    clf = classifier.Classifier(filename = filename,
                                model = lr_classifier,
                                transpose = transpose_input,
                                gene_file = gene_file,
                                cell_file = cell_file)
    #predict
    predictions = clf.celltype(mode = mode, p_thres = p_thres)
    if not majority_voting:
        return predictions
    #with 50 cells or fewer, skip majority voting and return the raw predictions
    if predictions.cell_count <= 50:
        logger.warn(f"⚠️ Warning: the input number of cells ({predictions.cell_count}) is too few to conduct proper over-clustering; no majority voting is performed")
        return predictions
    #over clustering
    if over_clustering is None:
        over_clustering = clf.over_cluster(use_GPU = use_GPU)
        #re-attach the classifier's AnnData to the result after over-clustering
        predictions.adata = clf.adata
    elif isinstance(over_clustering, str):
        if over_clustering in clf.adata.obs:
            over_clustering = clf.adata.obs[over_clustering]
        else:
            logger.info(f"👀 Did not identify '{over_clustering}' as a cell metadata column, assume it to be a plain text file")
            try:
                with open(over_clustering, 'rt') as f:
                    over_clustering = [line.strip() for line in f]
            except Exception as e:
                #keep the original exception type and message for callers, but chain the cause explicitly
                raise Exception(
                        f"🛑 {e}") from e
    #any other input (list, tuple, array, series, index) is used as is
    if len(over_clustering) != clf.adata.n_obs:
        raise ValueError(
                f"🛑 Length of `over_clustering` ({len(over_clustering)}) does not match the number of input cells ({clf.adata.n_obs})")
    #majority voting
    return classifier.Classifier.majority_vote(predictions, over_clustering, min_prop = min_prop)