{ "cells": [ { "cell_type": "markdown", "id": "understood-farmer", "metadata": { "tags": [] }, "source": [ "# Using CellTypist for cell type classification\n", "This notebook showcases the cell type classification for scRNA-seq query data by retrieving the most likely cell type labels from either the built-in CellTypist models or the user-trained custom models." ] }, { "cell_type": "markdown", "id": "exterior-thousand", "metadata": {}, "source": [ "Only the main steps and key parameters are introduced in this notebook. Refer to detailed [Usage](https://github.com/Teichlab/celltypist#usage) if you want to learn more." ] }, { "cell_type": "markdown", "id": "assisted-clear", "metadata": {}, "source": [ "## Install CellTypist" ] }, { "cell_type": "code", "execution_count": 1, "id": "automotive-traveler", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting celltypist\n", " Using cached celltypist-1.2.0-py3-none-any.whl (5.3 MB)\n", "Requirement already satisfied: pandas>=1.0.5 in /opt/conda/lib/python3.8/site-packages (from celltypist) (1.2.3)\n", "Requirement already satisfied: click>=7.1.2 in /opt/conda/lib/python3.8/site-packages (from celltypist) (7.1.2)\n", "Requirement already satisfied: requests>=2.23.0 in /opt/conda/lib/python3.8/site-packages (from celltypist) (2.25.1)\n", "Requirement already satisfied: leidenalg>=0.8.3 in /opt/conda/lib/python3.8/site-packages (from celltypist) (0.8.3)\n", "Requirement already satisfied: scikit-learn>=0.24.1 in /opt/conda/lib/python3.8/site-packages (from celltypist) (0.24.1)\n", "Requirement already satisfied: openpyxl>=3.0.4 in /opt/conda/lib/python3.8/site-packages (from celltypist) (3.0.7)\n", "Requirement already satisfied: scanpy>=1.7.0 in /opt/conda/lib/python3.8/site-packages (from celltypist) (1.7.1)\n", "Requirement already satisfied: numpy>=1.19.0 in /opt/conda/lib/python3.8/site-packages (from celltypist) (1.20.1)\n", "Requirement already satisfied: 
et-xmlfile in /opt/conda/lib/python3.8/site-packages (from openpyxl>=3.0.4->celltypist) (1.0.1)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.8/site-packages (from pandas>=1.0.5->celltypist) (2.8.1)\n", "Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.8/site-packages (from pandas>=1.0.5->celltypist) (2021.1)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas>=1.0.5->celltypist) (1.15.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.23.0->celltypist) (2020.12.5)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.23.0->celltypist) (4.0.0)\n", "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.23.0->celltypist) (2.10)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.23.0->celltypist) (1.26.3)\n", "Requirement already satisfied: legacy-api-wrap in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.0.0)\n", "Requirement already satisfied: scipy>=1.4 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (1.6.1)\n", "Requirement already satisfied: patsy in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.5.1)\n", "Requirement already satisfied: umap-learn>=0.3.10 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.4.6)\n", "Requirement already satisfied: h5py>=2.10.0 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (3.1.0)\n", "Requirement already satisfied: anndata>=0.7.4 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.7.5)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) 
(4.58.0)\n", "Requirement already satisfied: natsort in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (7.1.1)\n", "Requirement already satisfied: matplotlib>=3.1.2 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (3.3.4)\n", "Requirement already satisfied: networkx>=2.3 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (2.5)\n", "Requirement already satisfied: tables in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (3.6.1)\n", "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (1.0.1)\n", "Requirement already satisfied: statsmodels>=0.10.0rc2 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.12.2)\n", "Requirement already satisfied: sinfo in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.3.1)\n", "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (20.9)\n", "Requirement already satisfied: numba>=0.41.0 in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.51.2)\n", "Requirement already satisfied: seaborn in /opt/conda/lib/python3.8/site-packages (from scanpy>=1.7.0->celltypist) (0.11.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/conda/lib/python3.8/site-packages (from matplotlib>=3.1.2->scanpy>=1.7.0->celltypist) (2.4.7)\n", "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.8/site-packages (from matplotlib>=3.1.2->scanpy>=1.7.0->celltypist) (8.1.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.8/site-packages (from matplotlib>=3.1.2->scanpy>=1.7.0->celltypist) (1.3.1)\n", "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.8/site-packages (from matplotlib>=3.1.2->scanpy>=1.7.0->celltypist) (0.10.0)\n", "Requirement already satisfied: 
decorator>=4.3.0 in /opt/conda/lib/python3.8/site-packages (from networkx>=2.3->scanpy>=1.7.0->celltypist) (4.4.2)\n", "Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /opt/conda/lib/python3.8/site-packages (from numba>=0.41.0->scanpy>=1.7.0->celltypist) (0.34.0)\n", "Requirement already satisfied: setuptools in /opt/conda/lib/python3.8/site-packages (from numba>=0.41.0->scanpy>=1.7.0->celltypist) (49.6.0.post20210108)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.24.1->celltypist) (2.1.0)\n", "Requirement already satisfied: get-version>=2.0.4 in /opt/conda/lib/python3.8/site-packages (from legacy-api-wrap->scanpy>=1.7.0->celltypist) (2.1)\n", "Requirement already satisfied: stdlib-list in /opt/conda/lib/python3.8/site-packages (from sinfo->scanpy>=1.7.0->celltypist) (0.7.0)\n", "Requirement already satisfied: numexpr>=2.6.2 in /opt/conda/lib/python3.8/site-packages (from tables->scanpy>=1.7.0->celltypist) (2.7.3)\n", "Installing collected packages: celltypist\n", "Successfully installed celltypist-1.2.0\n" ] } ], "source": [ "%pip install celltypist" ] }, { "cell_type": "code", "execution_count": 2, "id": "organized-wedding", "metadata": {}, "outputs": [], "source": [ "import scanpy as sc" ] }, { "cell_type": "code", "execution_count": 3, "id": "intelligent-standard", "metadata": {}, "outputs": [], "source": [ "import celltypist\n", "from celltypist import models" ] }, { "cell_type": "markdown", "id": "julian-banana", "metadata": {}, "source": [ "## Download a scRNA-seq dataset of 2,000 immune cells" ] }, { "cell_type": "code", "execution_count": 4, "id": "naval-seminar", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c469cfcd41b541659147f8bf0f362444", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0.00/34.1M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ 
"adata_2000 = sc.read('celltypist_demo_folder/demo_2000_cells.h5ad', backup_url = 'https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad')" ] }, { "cell_type": "markdown", "id": "passive-classics", "metadata": {}, "source": [ "This dataset includes 2,000 cells and 18,950 genes collected from different studies, thereby showing the practical applicability of CellTypist." ] }, { "cell_type": "code", "execution_count": 5, "id": "sensitive-chancellor", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2000, 18950)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_2000.shape" ] }, { "cell_type": "markdown", "id": "destroyed-distribution", "metadata": { "tags": [] }, "source": [ "The expression matrix (`adata_2000.X`) has already been pre-processed into the format CellTypist requires: log1p-transformed expression normalised to 10,000 counts per cell (alternatively, this matrix can be stashed in `.raw.X`)." ] }, { "cell_type": "code", "execution_count": 6, "id": "sized-grocery", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "matrix([[10000. ],\n", " [10000.002],\n", " [10000. ],\n", " ...,\n", " [10000. ],\n", " [10000. ],\n", " [10000. ]], dtype=float32)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_2000.X.expm1().sum(axis = 1)" ] }, { "cell_type": "markdown", "id": "approximate-excerpt", "metadata": {}, "source": [ "Some pre-assigned cell type labels are also in the data, which will be compared to the predicted labels from CellTypist later." ] }, { "cell_type": "code", "execution_count": 7, "id": "downtown-arbitration", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | cell_type | \n", "
|---|---|
| cell1 | \n", "Plasma cells | \n", "
| cell2 | \n", "Plasma cells | \n", "
| cell3 | \n", "Plasma cells | \n", "
| cell4 | \n", "Plasma cells | \n", "
| cell5 | \n", "Plasma cells | \n", "
| ... | \n", "... | \n", "
| cell1996 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1997 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1998 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1999 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell2000 | \n", "Neutrophil-myeloid progenitor | \n", "
2000 rows × 1 columns
\n", "| \n", " | model | \n", "description | \n", "
|---|---|---|
| 0 | \n", "Immune_All_Low.pkl | \n", "immune sub-populations combined from 20 tissue... | \n", "
| 1 | \n", "Immune_All_High.pkl | \n", "immune populations combined from 20 tissues of... | \n", "
| 2 | \n", "Adult_Mouse_Gut.pkl | \n", "cell types in the adult mouse gut combined fro... | \n", "
| 3 | \n", "COVID19_Immune_Landscape.pkl | \n", "immune subtypes from lung and blood of COVID-1... | \n", "
| 4 | \n", "Cells_Fetal_Lung.pkl | \n", "cell types from human embryonic and fetal lungs | \n", "
| 5 | \n", "Cells_Intestinal_Tract.pkl | \n", "intestinal cells from fetal, pediatric and adu... | \n", "
| 6 | \n", "Cells_Lung_Airway.pkl | \n", "cell populations from scRNA-seq of five locati... | \n", "
| 7 | \n", "Developing_Mouse_Brain.pkl | \n", "cell types from the embryonic mouse brain betw... | \n", "
| 8 | \n", "Healthy_COVID19_PBMC.pkl | \n", "peripheral blood mononuclear cell types from h... | \n", "
| 9 | \n", "Human_Lung_Atlas.pkl | \n", "integrated Human Lung Cell Atlas (HLCA) combin... | \n", "
| 10 | \n", "Nuclei_Lung_Airway.pkl | \n", "cell populations from snRNA-seq of five locati... | \n", "
| 11 | \n", "Pan_Fetal_Human.pkl | \n", "stromal and immune populations from the human ... | \n", "
| \n", " | predicted_labels | \n", "over_clustering | \n", "majority_voting | \n", "
|---|---|---|---|
| cell1 | \n", "Plasma cells | \n", "44 | \n", "Plasma cells | \n", "
| cell2 | \n", "Plasma cells | \n", "12 | \n", "Plasma cells | \n", "
| cell3 | \n", "Plasma cells | \n", "36 | \n", "gamma-delta T cells | \n", "
| cell4 | \n", "Plasma cells | \n", "1 | \n", "Plasma cells | \n", "
| cell5 | \n", "Plasma cells | \n", "1 | \n", "Plasma cells | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| cell1996 | \n", "HSC/MPP | \n", "9 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1997 | \n", "Neutrophil-myeloid progenitor | \n", "27 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1998 | \n", "Neutrophil-myeloid progenitor | \n", "28 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell1999 | \n", "Neutrophil-myeloid progenitor | \n", "27 | \n", "Neutrophil-myeloid progenitor | \n", "
| cell2000 | \n", "Neutrophil-myeloid progenitor | \n", "9 | \n", "Neutrophil-myeloid progenitor | \n", "
2000 rows × 3 columns
\n", "| \n", " | cell_type | \n", "predicted_labels | \n", "over_clustering | \n", "majority_voting | \n", "conf_score | \n", "
|---|---|---|---|---|---|
| cell1 | \n", "Plasma cells | \n", "Plasma cells | \n", "44 | \n", "Plasma cells | \n", "0.999762 | \n", "
| cell2 | \n", "Plasma cells | \n", "Plasma cells | \n", "12 | \n", "Plasma cells | \n", "0.999926 | \n", "
| cell3 | \n", "Plasma cells | \n", "Plasma cells | \n", "36 | \n", "gamma-delta T cells | \n", "0.955991 | \n", "
| cell4 | \n", "Plasma cells | \n", "Plasma cells | \n", "1 | \n", "Plasma cells | \n", "0.999883 | \n", "
| cell5 | \n", "Plasma cells | \n", "Plasma cells | \n", "1 | \n", "Plasma cells | \n", "0.999890 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| cell1996 | \n", "Neutrophil-myeloid progenitor | \n", "HSC/MPP | \n", "9 | \n", "Neutrophil-myeloid progenitor | \n", "0.152962 | \n", "
| cell1997 | \n", "Neutrophil-myeloid progenitor | \n", "Neutrophil-myeloid progenitor | \n", "27 | \n", "Neutrophil-myeloid progenitor | \n", "0.810408 | \n", "
| cell1998 | \n", "Neutrophil-myeloid progenitor | \n", "Neutrophil-myeloid progenitor | \n", "28 | \n", "Neutrophil-myeloid progenitor | \n", "0.961021 | \n", "
| cell1999 | \n", "Neutrophil-myeloid progenitor | \n", "Neutrophil-myeloid progenitor | \n", "27 | \n", "Neutrophil-myeloid progenitor | \n", "0.131777 | \n", "
| cell2000 | \n", "Neutrophil-myeloid progenitor | \n", "Neutrophil-myeloid progenitor | \n", "9 | \n", "Neutrophil-myeloid progenitor | \n", "0.985607 | \n", "
2000 rows × 5 columns
\n", "