Skip to content

Commit d9b0093

Browse files
committed
Updated metadata extractor
1 parent f3948e8 commit d9b0093

File tree

4 files changed

+248
-30
lines changed

4 files changed

+248
-30
lines changed

bedboss/bbuploader/metadata_extractor.py

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"h1-hesc": "H1-hESC",
4242
"h1975": "H1975",
4343
"hap1": "HAP1",
44+
"hepg2": "HepG2",
4445
"hcc1954": "HCC1954",
4546
"hcc3153": "HCC3153",
4647
"hek293": "HEK293",
@@ -67,6 +68,7 @@
6768
"lncap": "LNCaP",
6869
"loucy": "Loucy",
6970
"mcf-7": "MCF-7",
71+
"mcf7": "MCF-7",
7072
"mcf10a": "MCF10A",
7173
"mda-mb-231": "MDA-MB-231",
7274
"mda-mb-453": "MDA-MB-453",
@@ -114,25 +116,25 @@
114116
# ---- TF & Histone ChIP (most specific first) ----
115117
"tf chip-seq": "TF ChIP-seq",
116118
"histone chip-seq": "Histone ChIP-seq",
117-
"h3k27ac": "H3K27ac",
118-
"h3k27me3": "H3K27me3",
119-
"h3k9me2": "H3K9me2",
120-
"h3k9me3": "H3K9me3",
121-
"h3k4me3": "H3K4me3",
122-
"h3k4me2": "H3K4me2",
123-
"h3k4me1": "H3K4me1",
124-
"h3k36me3": "H3K36me3",
125-
"h3k79me2": "H3K79me2",
126-
"h3k79me3": "H3K79me3",
127-
"h3k27me1": "H3K27me1",
128-
"h3k9ac": "H3K9ac",
129-
"h3k14ac": "H3K14ac",
130-
"h3k18ac": "H3K18ac",
131-
"h3k23ac": "H3K23ac",
132-
"h4k20me1": "H4K20me1",
133-
"h4k20me3": "H4K20me3",
134-
"h2ak119ub": "H2AK119ub",
135-
"h2bk120ub": "H2BK120ub",
119+
# "h3k27ac": "H3K27ac",
120+
# "h3k27me3": "H3K27me3",
121+
# "h3k9me2": "H3K9me2",
122+
# "h3k9me3": "H3K9me3",
123+
# "h3k4me3": "H3K4me3",
124+
# "h3k4me2": "H3K4me2",
125+
# "h3k4me1": "H3K4me1",
126+
# "h3k36me3": "H3K36me3",
127+
# "h3k79me2": "H3K79me2",
128+
# "h3k79me3": "H3K79me3",
129+
# "h3k27me1": "H3K27me1",
130+
# "h3k9ac": "H3K9ac",
131+
# "h3k14ac": "H3K14ac",
132+
# "h3k18ac": "H3K18ac",
133+
# "h3k23ac": "H3K23ac",
134+
# "h4k20me1": "H4K20me1",
135+
# "h4k20me3": "H4K20me3",
136+
# "h2ak119ub": "H2AK119ub",
137+
# "h2bk120ub": "H2BK120ub",
136138
"biotin chip-seq": "Biotin ChIP-seq",
137139
# ---- Generic ChIP ----
138140
"chip-seq": "ChIP-seq",
@@ -175,13 +177,37 @@
175177
"bruuvseq": "bruUVseq",
176178
"bruseq": "BruSeq",
177179
"selex": "SELEX",
180+
"starr-seq": "STARR-seq",
178181
"microrna counts": "microRNA counts",
179182
"mitoperturb-seq": "MitoPerturb-Seq",
180183
# ---- Fallback ----
181184
"other": "OTHER",
182185
}
183186

184187

188+
# Histone-mark names, grouped below by histone (H3, H4, H2A/H2B).
# NOTE(review): how this list is consumed is not visible in this hunk —
# presumably it is matched against record text separately from the assay
# mapping; confirm against the caller.
antibody = [
    # H3 marks
    "H3K27ac", "H3K27me3", "H3K9me2", "H3K9me3",
    "H3K4me3", "H3K4me2", "H3K4me1",
    "H3K36me3", "H3K79me2", "H3K79me3", "H3K27me1",
    "H3K9ac", "H3K14ac", "H3K18ac", "H3K23ac",
    # H4 marks
    "H4K20me1", "H4K20me3",
    # H2A / H2B marks
    "H2AK119ub", "H2BK120ub",
]
209+
210+
185211
def standardize_cell_line(cell_line: str) -> str:
186212
"""
187213
Standardize cell line names to a consistent format.

bedboss/scripts/make_umap.py

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -319,26 +319,57 @@ def get_embeddings(
319319

320320
return_df = merged.copy()
321321

322+
##############################################################
323+
######## Option 1 ############################################
324+
######### Remove empty/None cell lines and assays ############
325+
##############################################################
326+
327+
# # Select top cell lines available in the dataset
328+
# if top_cell_lines is not None:
329+
# top_cell_lines_list = [
330+
# x
331+
# for x in merged[CELL_LINE].value_counts().nlargest(top_cell_lines).index
332+
# if x is not None and x != ""
333+
# ]
334+
#
335+
# return_df = return_df[return_df[CELL_LINE].isin(top_cell_lines_list)]
336+
#
337+
# # Select top assays available in the dataset
338+
# if top_assays is not None:
339+
# top_assays_list = [
340+
# x
341+
# for x in merged[ASSAY].value_counts().nlargest(top_assays).index
342+
# if x is not None and x != ""
343+
# ]
344+
#
345+
# return_df = return_df[return_df[ASSAY].isin(top_assays_list)]
346+
347+
################################################################################
348+
######################### Option 2 #############################################
349+
## Label empty/None cell lines and assays as "UNKNOWN" instead of removing them
350+
################################################################################
351+
na_name = "UNKNOWN"
352+
353+
return_df[CELL_LINE] = return_df[CELL_LINE].fillna(na_name).replace("", na_name)
354+
return_df[ASSAY] = return_df[ASSAY].fillna(na_name).replace("", na_name)
355+
322356
# Select top cell lines available in the dataset
323357
if top_cell_lines is not None:
324-
top_cell_lines_list = [
325-
x
326-
for x in merged[CELL_LINE].value_counts().nlargest(top_cell_lines).index
327-
if x is not None and x != ""
328-
]
358+
top_cell_lines_list = list(
359+
return_df[CELL_LINE].value_counts().nlargest(top_cell_lines).index
360+
)
329361

330362
return_df = return_df[return_df[CELL_LINE].isin(top_cell_lines_list)]
331363

332364
# Select top assays available in the dataset
333365
if top_assays is not None:
334-
top_assays_list = [
335-
x
336-
for x in merged[ASSAY].value_counts().nlargest(top_assays).index
337-
if x is not None and x != ""
338-
]
339-
366+
top_assays_list = list(
367+
return_df[ASSAY].value_counts().nlargest(top_assays).index
368+
)
340369
return_df = return_df[return_df[ASSAY].isin(top_assays_list)]
341370

371+
###############################################################################
372+
342373
umap_return = create_umap(
343374
return_df,
344375
n_components=n_components,

scripts/update_assay.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
Backfill missing assay values in bed_metadata table.
3+
4+
Targets records where assay is NULL, empty string, or "OTHER".
5+
6+
Usage:
7+
python scripts/update_assay.py <config.yaml>
8+
python scripts/update_assay.py <config.yaml> --dry-run
9+
"""
10+
11+
import argparse
12+
import logging
13+
14+
from sqlalchemy.orm import Session
15+
from sqlalchemy import select, or_
16+
17+
from bbconf.bbagent import BedBaseAgent
18+
from bbconf.db_utils import Bed, BedMetadata
19+
from bedboss.bbuploader.metadata_extractor import find_assay
20+
21+
logging.basicConfig(level=logging.INFO)
22+
logger = logging.getLogger(__name__)
23+
24+
BATCH_SIZE = 500
25+
26+
27+
def main():
    """
    Backfill the assay of records whose assay is NULL, empty, or "OTHER".

    Parses the command line (config path plus optional ``--dry-run``), loads
    every matching ``Bed`` row, and infers an assay with ``find_assay`` from
    the record's description, original file name, and name joined by spaces.
    A record is updated only when an assay is inferred AND it differs from
    the value already stored, so re-inferring the same value (e.g. "OTHER"
    for a record that is already "OTHER") is neither written nor counted.

    Changes are committed every ``BATCH_SIZE`` processed records and once
    more at the end. With ``--dry-run`` nothing is written; the would-be
    changes are only logged.
    """
    parser = argparse.ArgumentParser(
        description="Backfill missing assay in bed_metadata"
    )
    parser.add_argument("config", help="Path to bbconf YAML config file")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing to the database",
    )
    args = parser.parse_args()

    agent = BedBaseAgent(config=args.config)

    # expire_on_commit=False: all rows are loaded up front and commits happen
    # per batch. With the default (True) each commit would expire every loaded
    # instance and force a refresh SELECT per remaining row.
    with Session(agent.config.db_engine.engine, expire_on_commit=False) as session:
        stmt = (
            select(Bed)
            .join(BedMetadata)
            .where(
                or_(
                    BedMetadata.assay == "",
                    BedMetadata.assay.is_(None),
                    BedMetadata.assay == "OTHER",
                )
            )
        )
        beds = session.scalars(stmt).all()
        total = len(beds)
        logger.info(f"Found {total} records with missing/OTHER assay")

        updated = 0
        for i, bed in enumerate(beds, 1):
            # Build the search text from whichever descriptive fields are set.
            parts = [bed.description, bed.annotations.original_file_name, bed.name]
            combined = " ".join(p for p in parts if p)
            assay = find_assay(combined)

            # Skip when nothing was inferred or the inferred value is what is
            # already stored (no-op write, would inflate the update count).
            if assay and assay != bed.annotations.assay:
                logger.info(f"  [{bed.id}] '{combined[:80]}...' -> assay='{assay}'")
                if not args.dry_run:
                    bed.annotations.assay = assay
                updated += 1

            # Periodic commit keeps the pending transaction small.
            if not args.dry_run and i % BATCH_SIZE == 0:
                session.commit()
                logger.info(f"  Committed batch ({i}/{total} processed so far)")

        if not args.dry_run:
            session.commit()
            logger.info(f"Committed final batch. Updated {updated}/{total} records.")
        else:
            logger.info(f"[DRY RUN] Would update {updated}/{total} records")
78+
79+
80+
if __name__ == "__main__":
81+
main()

scripts/update_cell_line.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Backfill missing cell_line values in bed_metadata table.
3+
4+
Usage:
5+
python scripts/update_cell_line.py <config.yaml>
6+
python scripts/update_cell_line.py <config.yaml> --dry-run
7+
"""
8+
9+
import argparse
10+
import logging
11+
12+
from sqlalchemy.orm import Session
13+
from sqlalchemy import select, or_
14+
15+
from bbconf.bbagent import BedBaseAgent
16+
from bbconf.db_utils import Bed, BedMetadata
17+
from bedboss.bbuploader.metadata_extractor import find_cell_line
18+
19+
logging.basicConfig(level=logging.INFO)
20+
logger = logging.getLogger(__name__)
21+
22+
BATCH_SIZE = 500
23+
24+
25+
def main():
    """
    Backfill the cell_line of records whose cell_line is NULL or empty.

    Parses the command line (config path plus optional ``--dry-run``), loads
    every matching ``Bed`` row, and infers a cell line with ``find_cell_line``
    from the record's description, original file name, and name joined by
    spaces. Records for which nothing is inferred are left untouched.

    Changes are committed every ``BATCH_SIZE`` processed records and once
    more at the end. With ``--dry-run`` nothing is written; the would-be
    changes are only logged.
    """
    parser = argparse.ArgumentParser(
        description="Backfill missing cell_line in bed_metadata"
    )
    parser.add_argument("config", help="Path to bbconf YAML config file")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing to the database",
    )
    args = parser.parse_args()

    agent = BedBaseAgent(config=args.config)

    # expire_on_commit=False: all rows are loaded up front and commits happen
    # per batch. With the default (True) each commit would expire every loaded
    # instance and force a refresh SELECT per remaining row.
    with Session(agent.config.db_engine.engine, expire_on_commit=False) as session:
        stmt = (
            select(Bed)
            .join(BedMetadata)
            .where(
                or_(
                    BedMetadata.cell_line == "",
                    BedMetadata.cell_line.is_(None),
                )
            )
        )
        beds = session.scalars(stmt).all()
        total = len(beds)
        logger.info(f"Found {total} records with missing cell_line")

        updated = 0
        for i, bed in enumerate(beds, 1):
            # Build the search text from whichever descriptive fields are set.
            parts = [bed.description, bed.annotations.original_file_name, bed.name]
            combined = " ".join(p for p in parts if p)
            cell_line = find_cell_line(combined)

            if cell_line:
                logger.info(
                    f"  [{bed.id}] '{combined[:80]}...' -> cell_line='{cell_line}'"
                )
                if not args.dry_run:
                    bed.annotations.cell_line = cell_line
                updated += 1

            # Periodic commit keeps the pending transaction small.
            if not args.dry_run and i % BATCH_SIZE == 0:
                session.commit()
                logger.info(f"  Committed batch ({i}/{total} processed so far)")

        if not args.dry_run:
            session.commit()
            logger.info(f"Committed final batch. Updated {updated}/{total} records.")
        else:
            logger.info(f"[DRY RUN] Would update {updated}/{total} records")
77+
78+
79+
if __name__ == "__main__":
80+
main()

0 commit comments

Comments
 (0)