Add ontology term name and cell state to obs (#52)

parashardhapola · web-flow · commit 6c40ed4da755 · 2025-11-20T03:20:14.000+01:00
* add ontology term and cell state to obs

* change domain

* update docs domain

* version bump

* name fix and docs update

* Update test_main.py
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@
 
 ---
 <a href="https://colab.research.google.com/drive/1aRLsI3mx8JR8u5BKHs48YUbLsqRsh2N7?usp=sharing" target="_blank">Example Notebook</a> | 
-<a href="https://nygen-labs-prod--cytetype-api.modal.run/report/2b514924-334f-4f5c-aa25-347155586634?v=251123" target="_blank">Example output</a> | 
+<a href="https://prod.cytetype.nygen.io/report/2b514924-334f-4f5c-aa25-347155586634?v=251123" target="_blank">Example output</a> | 
 <a href="docs/examples.md">Atlas scale results</a>
 
 Switch to R/Seurat package: <a href="https://github.com/NygenAnalytics/CyteTypeR">CyteTypeR</a>
@@ -47,19 +47,25 @@ sc.pp.log1p(adata)
 sc.pp.highly_variable_genes(adata, n_top_genes=1000)
 sc.pp.pca(adata)
 sc.pp.neighbors(adata)
-sc.tl.leiden(adata, key_added="clusters")
-sc.tl.rank_genes_groups(adata, groupby="clusters", method="t-test")
+
+group_key = 'clusters'  # Wherever you want to store or already have clusters in adata.obs
+
+sc.tl.leiden(adata, key_added=group_key)
+sc.tl.umap(adata)
+sc.tl.rank_genes_groups(adata, groupby=group_key, method="t-test")
 # ------ Example Scanpy Pipeline ------
 
 # ------ CyteType ------
-annotator = CyteType(adata, group_key="clusters")
+annotator = CyteType(adata, group_key=group_key)
 adata = annotator.run(
     study_context="Brief study description (e.g., Human brain tissue ...)",
 )
 
-# View results
-print(adata.obs.cytetype_annotation_clusters)
-print(adata.obs.cytetype_cellOntologyTerm_clusters)
+# Visualize results
+sc.pl.embedding(adata, basis='umap', color=f'cytetype_annotation_{group_key}')
+sc.pl.embedding(adata, basis='umap', color=f'cytetype_cellOntologyTerm_{group_key}')
+sc.pl.embedding(adata, basis='umap', color=f'cytetype_cellOntologyTermID_{group_key}')
+sc.pl.embedding(adata, basis='umap', color=f'cytetype_cellState_{group_key}')
 ```
 
 ## Documentation
diff --git a/cytetype/__init__.py b/cytetype/__init__.py
@@ -1,4 +1,4 @@
 from .main import CyteType
 
 __all__ = ["CyteType"]
-__version__ = "0.10.0"
+__version__ = "0.11.0"
diff --git a/cytetype/api.py b/cytetype/api.py
@@ -157,6 +157,9 @@ def _transform_results(results_data: Dict[str, Any]) -> Dict[str, Any]:
                         "clusterId": annotation_data.get("clusterId", cluster_id),
                         "annotation": annotation_data.get("annotation", "Unknown"),
                         "ontologyTerm": annotation_data.get(
+                            "cellOntologyTermName", "Unknown"
+                        ),
+                        "ontologyTermID": annotation_data.get(
                             "cellOntologyTerm", "Unknown"
                         ),
                         # Include additional fields from new format
diff --git a/cytetype/config.py b/cytetype/config.py
@@ -11,6 +11,6 @@
 )
 
 
-DEFAULT_API_URL = "https://nygen-labs-prod--cytetype-api.modal.run"
+DEFAULT_API_URL = "https://prod.cytetype.nygen.io"
 DEFAULT_POLL_INTERVAL = 10
 DEFAULT_TIMEOUT = 7200
diff --git a/cytetype/main.py b/cytetype/main.py
@@ -218,6 +218,31 @@ def _store_results_and_annotations(
             ).astype("category")
         )
 
+        # Update ontology term IDs
+        ontology_id_map = {
+            item["clusterId"]: item["ontologyTermID"]
+            for item in result_data.get("annotations", [])
+        }
+        self.adata.obs[f"{results_prefix}_cellOntologyTermID_{self.group_key}"] = (
+            pd.Series(
+                [
+                    ontology_id_map.get(cluster_id, "Unknown")
+                    for cluster_id in self.clusters
+                ],
+                index=self.adata.obs.index,
+            ).astype("category")
+        )
+
+        # Update cell states
+        cell_state_map = {
+            item["clusterId"]: item.get("cellState", "")
+            for item in result_data.get("annotations", [])
+        }
+        self.adata.obs[f"{results_prefix}_cellState_{self.group_key}"] = pd.Series(
+            [cell_state_map.get(cluster_id, "") for cluster_id in self.clusters],
+            index=self.adata.obs.index,
+        ).astype("category")
+
         # Check for unannotated clusters if requested
         if check_unannotated:
             unannotated_clusters = set(
@@ -238,6 +263,8 @@ def _store_results_and_annotations(
         logger.success(
             f"Annotations successfully added to `adata.obs['{results_prefix}_annotation_{self.group_key}']`\n"
             f"Ontology terms added to `adata.obs['{results_prefix}_cellOntologyTerm_{self.group_key}']`\n"
+            f"Ontology term IDs added to `adata.obs['{results_prefix}_cellOntologyTermID_{self.group_key}']`\n"
+            f"Cell states added to `adata.obs['{results_prefix}_cellState_{self.group_key}']`\n"
             f"Full results added to `adata.uns['{results_prefix}_results']`."
         )
 
diff --git a/docs/examples.md b/docs/examples.md
@@ -5,13 +5,13 @@ The following are notebooks used to run CyteType on all the single-cell datasets
 
 | Dataset | Links |
 | --- | --- |
-| **Tabula Sapiens** | [Colab](https://colab.research.google.com/drive/1EyQXaruDJBPICUvlUY1E19zxOm_L4_VU?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/15332f10-2048-4099-ab1e-baf2ab9e39c3) - [H5ad](https://drive.google.com/file/d/1URo7niPqAo-9HGVH8f3QJfqll9lc8JN_/view?usp=drive_link) |
-| **GTEX v9** | [Colab](https://colab.research.google.com/drive/1uvqG2eVaUuNe66e0_7bp682uCdKx6-KL?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/5242f3b8-0078-417d-954e-00d1bb19bdf6) - [H5ad](https://drive.google.com/file/d/1EIpudRyasLUHR6J2v8fdpmBTbCE2__UF/view?usp=drive_link) |
-| **Hypomap** | [Colab](https://colab.research.google.com/drive/1OuTnh8xHoXaINCGcgu_1q-jANwXL8ggF?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/3840b662-bacf-4067-b93d-4e57c1f21187) - [H5ad](https://drive.google.com/file/d/1QMvZNdoDlKpOmyguAXSk45-YVz97v4tM/view?usp=drive_link) |
-| **Human Lung Cell Atlas (Core)** | [Colab](https://colab.research.google.com/drive/1FoTD-XzLNDPgYSlgVsxnLwPnWF5YiKny?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/6da1458a-392f-4bce-b6c9-4ccb308c8797) - [H5ad](https://drive.google.com/file/d/13O0dyUnwJKLPm8fncRt597S5hs2COsxx/view?usp=drive_link) |
-| **Immune Cell Atlas** | [Colab](https://colab.research.google.com/drive/1Kum9S_kU76QvS__42ABd-Xp1GpH4c9jU?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/05ff7629-8f0c-4b95-ac65-30bba9b384c5) - [H5ad](https://drive.google.com/file/d/1iqkC7dG1ovgKsU_8HdZ2eyELIxB0sM3t/view?usp=drive_link) |
-| **Mouse Pancreatic Cell Atlas** | [Colab](https://colab.research.google.com/drive/1fg9W3Lz-E_yAVoqs_6XrQsYkfsfnzFey?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/6d248cd2-6b61-4beb-bc58-1d63c7a2fc34) - [H5ad](https://drive.google.com/file/d/19qpRfz4WGuUsRNl0YKuy3YENfHKI6pz-/view?usp=drive_link) |
-| **Diabetic Kidney Disease** | [Colab](https://colab.research.google.com/drive/1kb3urFbl0PEPW4T_ti0DBTAmi5YK_-t1?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/0da4eaef-f165-4800-a4e3-c5cf8ec165ad) - [H5ad](https://drive.google.com/file/d/1yZXYlfZHLYcPL18Jy25J4v8kWQYhSsd7/view?usp=drive_link) |
+| **Tabula Sapiens** | [Colab](https://colab.research.google.com/drive/1EyQXaruDJBPICUvlUY1E19zxOm_L4_VU?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/15332f10-2048-4099-ab1e-baf2ab9e39c3) - [H5ad](https://drive.google.com/file/d/1URo7niPqAo-9HGVH8f3QJfqll9lc8JN_/view?usp=drive_link) |
+| **GTEX v9** | [Colab](https://colab.research.google.com/drive/1uvqG2eVaUuNe66e0_7bp682uCdKx6-KL?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/5242f3b8-0078-417d-954e-00d1bb19bdf6) - [H5ad](https://drive.google.com/file/d/1EIpudRyasLUHR6J2v8fdpmBTbCE2__UF/view?usp=drive_link) |
+| **Hypomap** | [Colab](https://colab.research.google.com/drive/1OuTnh8xHoXaINCGcgu_1q-jANwXL8ggF?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/3840b662-bacf-4067-b93d-4e57c1f21187) - [H5ad](https://drive.google.com/file/d/1QMvZNdoDlKpOmyguAXSk45-YVz97v4tM/view?usp=drive_link) |
+| **Human Lung Cell Atlas (Core)** | [Colab](https://colab.research.google.com/drive/1FoTD-XzLNDPgYSlgVsxnLwPnWF5YiKny?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/6da1458a-392f-4bce-b6c9-4ccb308c8797) - [H5ad](https://drive.google.com/file/d/13O0dyUnwJKLPm8fncRt597S5hs2COsxx/view?usp=drive_link) |
+| **Immune Cell Atlas** | [Colab](https://colab.research.google.com/drive/1Kum9S_kU76QvS__42ABd-Xp1GpH4c9jU?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/05ff7629-8f0c-4b95-ac65-30bba9b384c5) - [H5ad](https://drive.google.com/file/d/1iqkC7dG1ovgKsU_8HdZ2eyELIxB0sM3t/view?usp=drive_link) |
+| **Mouse Pancreatic Cell Atlas** | [Colab](https://colab.research.google.com/drive/1fg9W3Lz-E_yAVoqs_6XrQsYkfsfnzFey?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/6d248cd2-6b61-4beb-bc58-1d63c7a2fc34) - [H5ad](https://drive.google.com/file/d/19qpRfz4WGuUsRNl0YKuy3YENfHKI6pz-/view?usp=drive_link) |
+| **Diabetic Kidney Disease** | [Colab](https://colab.research.google.com/drive/1kb3urFbl0PEPW4T_ti0DBTAmi5YK_-t1?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/0da4eaef-f165-4800-a4e3-c5cf8ec165ad) - [H5ad](https://drive.google.com/file/d/1yZXYlfZHLYcPL18Jy25J4v8kWQYhSsd7/view?usp=drive_link) |
 
 ## CellHint Organ Atlases
 
@@ -20,22 +20,22 @@ Data was annotated in across three notebooks: [Colab 1/3](https://colab.research
 
 | Tissue | Links |
 | --- | --- |
-| **Blood** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/d0c219b4-2b4a-4b27-bac9-aea280a972f1) |
-| **Bone Marrow** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/bc5099b5-42c2-4ba7-8fdd-bc7b5dd3e84d) |
-| **Heart** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/2ffd6cf1-ec98-43d1-82d7-1fc3e9a11b8c) |
-| **Hippocampus** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/60b98429-2338-4408-a07c-bb60e82ac793) |
-| **Intestine** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/e0a2ca37-872f-489c-8de1-d84434d409fe) |
-| **Kidney** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/7dd1f0ea-7eec-4968-b353-8b52707de5ac) |
-| **Liver** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/a429348c-530a-486c-8980-3349a583b8c4) |
-| **Lung** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/75e41a21-f771-4ebc-829a-82f93529a147) |
-| **Lymph Node** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/b911e212-fe37-4bdc-a7f3-51e9146bf8cc) |
-| **Pancreas** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/b620245a-1ae0-4025-aab7-52ada6dcc6cb) |
-| **Skeletal Muscle** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/3f35a45d-aa1b-42cb-92d2-e739623a402b) |
-| **Spleen** | [CyteType report](https://nygen-labs-prod--cytetype-api.modal.run/report/4b64ec02-ac01-45b5-84b9-0f16708cbd85) |
+| **Blood** | [CyteType report](https://prod.cytetype.nygen.io/report/d0c219b4-2b4a-4b27-bac9-aea280a972f1) |
+| **Bone Marrow** | [CyteType report](https://prod.cytetype.nygen.io/report/bc5099b5-42c2-4ba7-8fdd-bc7b5dd3e84d) |
+| **Heart** | [CyteType report](https://prod.cytetype.nygen.io/report/2ffd6cf1-ec98-43d1-82d7-1fc3e9a11b8c) |
+| **Hippocampus** | [CyteType report](https://prod.cytetype.nygen.io/report/60b98429-2338-4408-a07c-bb60e82ac793) |
+| **Intestine** | [CyteType report](https://prod.cytetype.nygen.io/report/e0a2ca37-872f-489c-8de1-d84434d409fe) |
+| **Kidney** | [CyteType report](https://prod.cytetype.nygen.io/report/7dd1f0ea-7eec-4968-b353-8b52707de5ac) |
+| **Liver** | [CyteType report](https://prod.cytetype.nygen.io/report/a429348c-530a-486c-8980-3349a583b8c4) |
+| **Lung** | [CyteType report](https://prod.cytetype.nygen.io/report/75e41a21-f771-4ebc-829a-82f93529a147) |
+| **Lymph Node** | [CyteType report](https://prod.cytetype.nygen.io/report/b911e212-fe37-4bdc-a7f3-51e9146bf8cc) |
+| **Pancreas** | [CyteType report](https://prod.cytetype.nygen.io/report/b620245a-1ae0-4025-aab7-52ada6dcc6cb) |
+| **Skeletal Muscle** | [CyteType report](https://prod.cytetype.nygen.io/report/3f35a45d-aa1b-42cb-92d2-e739623a402b) |
+| **Spleen** | [CyteType report](https://prod.cytetype.nygen.io/report/4b64ec02-ac01-45b5-84b9-0f16708cbd85) |
 
 ## Cell Landscapes from BIS
 Cell atlases hosted by [BIS](https://bis.zju.edu.cn/) from various organisms and specific tissues
 
 | Tissue | Links |
 | --- | --- |
-| Human Cell Landscape | [Colab](https://colab.research.google.com/drive/1czLW33FYbnPOmPvnfddsvehXM491UDGq?usp=sharing) - [CyteType Report](https://nygen-labs-prod--cytetype-api.modal.run/report/581616bf-3c96-4e58-a290-881b40378309) - [Homepage](https://bis.zju.edu.cn/HCL/) |
+| Human Cell Landscape | [Colab](https://colab.research.google.com/drive/1czLW33FYbnPOmPvnfddsvehXM491UDGq?usp=sharing) - [CyteType Report](https://prod.cytetype.nygen.io/report/581616bf-3c96-4e58-a290-881b40378309) - [Homepage](https://bis.zju.edu.cn/HCL/) |
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -93,16 +93,19 @@ def test_cytetype_success(
                 "clusterId": "1",
                 "annotation": "Cell Type A",
                 "ontologyTerm": "CL:0000001",
+                "ontologyTermID": "CL:0000001",
             },  # Corresponds to '0'
             {
                 "clusterId": "2",
                 "annotation": "Cell Type B",
                 "ontologyTerm": "CL:0000002",
+                "ontologyTermID": "CL:0000002",
             },  # Corresponds to '1'
             {
                 "clusterId": "3",
                 "annotation": "Cell Type C",
                 "ontologyTerm": "CL:0000003",
+                "ontologyTermID": "CL:0000003",
             },  # Corresponds to '2'
         ]
     }
@@ -265,16 +268,19 @@ def test_cytetype_with_auth_token(
                 "clusterId": "1",
                 "annotation": "Cell Type A",
                 "ontologyTerm": "CL:0000001",
+                "ontologyTermID": "CL:0000001",
             },
             {
                 "clusterId": "2",
                 "annotation": "Cell Type B",
                 "ontologyTerm": "CL:0000002",
+                "ontologyTermID": "CL:0000002",
             },
             {
                 "clusterId": "3",
                 "annotation": "Cell Type C",
                 "ontologyTerm": "CL:0000003",
+                "ontologyTermID": "CL:0000003",
             },
         ]
     }
@@ -312,6 +318,7 @@ def test_cytetype_get_results_helper(
                 "clusterId": "1",
                 "annotation": "Cell Type A",
                 "ontologyTerm": "CL:0000001",
+                "ontologyTermID": "CL:0000001",
             },
         ]
     }
@@ -362,6 +369,7 @@ def test_cytetype_with_metadata(
                 "clusterId": "1",
                 "annotation": "Cell Type A",
                 "ontologyTerm": "CL:0000001",
+                "ontologyTermID": "CL:0000001",
             },
         ]
     }
@@ -407,6 +415,7 @@ def test_cytetype_without_metadata(
                 "clusterId": "1",
                 "annotation": "Cell Type A",
                 "ontologyTerm": "CL:0000001",
+                "ontologyTermID": "CL:0000001",
             },
         ]
     }
@@ -421,6 +430,113 @@ def test_cytetype_without_metadata(
     assert "metadata" not in query_arg
 
 
+@patch("cytetype.main.submit_job")
+@patch("cytetype.main.poll_for_results")
+def test_cytetype_obs_columns(
+    mock_poll: MagicMock, mock_submit: MagicMock, mock_adata: anndata.AnnData
+) -> None:
+    """Test that all expected obs columns are created with correct names and values."""
+    job_id = "mock_job_obs_columns"
+    mock_submit.return_value = job_id
+    mock_result: dict[str, list[dict[str, str]]] = {
+        "annotations": [
+            {
+                "clusterId": "1",
+                "annotation": "T cell",
+                "ontologyTerm": "T cell",
+                "ontologyTermID": "CL:0000084",
+                "cellState": "activated",
+            },
+            {
+                "clusterId": "2",
+                "annotation": "B cell",
+                "ontologyTerm": "B cell",
+                "ontologyTermID": "CL:0000236",
+                "cellState": "naive",
+            },
+            {
+                "clusterId": "3",
+                "annotation": "Monocyte",
+                "ontologyTerm": "monocyte",
+                "ontologyTermID": "CL:0000576",
+                "cellState": "",  # Empty cell state
+            },
+        ]
+    }
+    mock_poll.return_value = mock_result
+
+    group_key = "leiden"
+    results_prefix = "cytetype"
+
+    cytetype = CyteType(mock_adata, group_key=group_key)
+    adata_result = cytetype.run(study_context="Test study context")
+
+    # Check all expected obs columns exist
+    expected_columns = [
+        f"{results_prefix}_annotation_{group_key}",
+        f"{results_prefix}_cellOntologyTerm_{group_key}",
+        f"{results_prefix}_cellOntologyTermID_{group_key}",
+        f"{results_prefix}_cellState_{group_key}",
+    ]
+
+    for col in expected_columns:
+        assert col in adata_result.obs, f"Column {col} not found in obs"
+        assert isinstance(adata_result.obs[col].dtype, pd.CategoricalDtype), (
+            f"Column {col} is not categorical"
+        )
+
+    # Check annotation values are correctly mapped
+    anno_col = f"{results_prefix}_annotation_{group_key}"
+    ct_map = {"0": "1", "1": "2", "2": "3"}  # cluster label -> cluster ID mapping
+    anno_map = {"1": "T cell", "2": "B cell", "3": "Monocyte"}
+    expected_annotations = [
+        anno_map[ct_map[str(label)]] for label in mock_adata.obs[group_key]
+    ]
+    pd.testing.assert_series_equal(
+        adata_result.obs[anno_col],
+        pd.Series(expected_annotations, index=mock_adata.obs.index, dtype="category"),
+        check_names=False,
+    )
+
+    # Check ontologyTerm values are correctly mapped
+    ontology_term_col = f"{results_prefix}_cellOntologyTerm_{group_key}"
+    ontology_term_map = {"1": "T cell", "2": "B cell", "3": "monocyte"}
+    expected_ontology_terms = [
+        ontology_term_map[ct_map[str(label)]] for label in mock_adata.obs[group_key]
+    ]
+    pd.testing.assert_series_equal(
+        adata_result.obs[ontology_term_col],
+        pd.Series(
+            expected_ontology_terms, index=mock_adata.obs.index, dtype="category"
+        ),
+        check_names=False,
+    )
+
+    # Check ontologyTermID values are correctly mapped
+    ontology_id_col = f"{results_prefix}_cellOntologyTermID_{group_key}"
+    ontology_id_map = {"1": "CL:0000084", "2": "CL:0000236", "3": "CL:0000576"}
+    expected_ontology_ids = [
+        ontology_id_map[ct_map[str(label)]] for label in mock_adata.obs[group_key]
+    ]
+    pd.testing.assert_series_equal(
+        adata_result.obs[ontology_id_col],
+        pd.Series(expected_ontology_ids, index=mock_adata.obs.index, dtype="category"),
+        check_names=False,
+    )
+
+    # Check cellState values are correctly mapped (including empty string)
+    cell_state_col = f"{results_prefix}_cellState_{group_key}"
+    cell_state_map = {"1": "activated", "2": "naive", "3": ""}
+    expected_cell_states = [
+        cell_state_map[ct_map[str(label)]] for label in mock_adata.obs[group_key]
+    ]
+    pd.testing.assert_series_equal(
+        adata_result.obs[cell_state_col],
+        pd.Series(expected_cell_states, index=mock_adata.obs.index, dtype="category"),
+        check_names=False,
+    )
+
+
 # --- TODO ---
 # - Add tests specifically for cytetype/anndata_helpers.py
 # - Add tests specifically for cytetype/client.py (e.g., more nuanced API responses)

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,6 @@`
`11`	`11`	`)`
`12`	`12`
`13`	`13`
`14`		`-DEFAULT_API_URL = "https://nygen-labs-prod--cytetype-api.modal.run"`
	`14`	`+DEFAULT_API_URL = "https://prod.cytetype.nygen.io"`
`15`	`15`	`DEFAULT_POLL_INTERVAL = 10`
`16`	`16`	`DEFAULT_TIMEOUT = 7200`