33import duckdb
44import polars as pl
55
6- from micov import MEMORY , THREADS
76from micov ._constants import (
87 COLUMN_COVERED ,
98 COLUMN_GENOME_ID ,
1918class View :
2019 """View subsets of coverage data."""
2120
22- def __init__ (self , dbbase , sample_metadata , features_to_keep ):
21+ def __init__ (
22+ self , dbbase , sample_metadata , features_to_keep , threads = 1 , memory = "8gb"
23+ ):
2324 self .dbbase = dbbase
2425 self .sample_metadata = sample_metadata
2526 self .features_to_keep = features_to_keep
@@ -28,7 +29,7 @@ def __init__(self, dbbase, sample_metadata, features_to_keep):
2829 self .constrain_features = False
2930
3031 self .con = duckdb .connect (
31- ":memory:" , config = {"threads" : THREADS , "memory_limit" : f"{ MEMORY } gb " }
32+ ":memory:" , config = {"threads" : threads , "memory_limit" : f"{ memory } " }
3233 )
3334 self ._init ()
3435
@@ -106,9 +107,9 @@ def _load_db(self):
106107 # TODO: replace with duckdb native per sample compression
107108 # Do we stream to parquet? this could be large
108109 positions_df = self .con .sql (f"""SELECT * FROM positions
109- ORDER BY { COLUMN_SAMPLE_ID } ,
110- { COLUMN_GENOME_ID } ,
111- { COLUMN_START } """ ).pl ()
110+ ORDER BY { COLUMN_SAMPLE_ID } ,
111+ { COLUMN_GENOME_ID } ,
112+ { COLUMN_START } """ ).pl ()
112113
113114 if len (positions_df ) == 0 :
114115 msg = "No positions left after filtering."
@@ -135,7 +136,7 @@ def _load_db(self):
135136 SELECT * FROM recomputed_coverage""" )
136137
137138 self .con .sql (f"""CREATE TABLE feature_metadata AS
138- SELECT *
139+ SELECT *, { COLUMN_STOP } - { COLUMN_START } AS { COLUMN_LENGTH }
139140 FROM feature_constraint fc
140141 SEMI JOIN coverage cov USING ({ COLUMN_GENOME_ID } )""" )
141142
@@ -158,18 +159,19 @@ def _load_db(self):
158159 JOIN metadata md
159160 ON pos.{ COLUMN_SAMPLE_ID } =md.{ COLUMN_SAMPLE_ID } """ )
160161 self .con .sql (f"""CREATE VIEW genome_lengths AS
161- SELECT { COLUMN_GENOME_ID } ,
162- FIRST({ COLUMN_LENGTH } ) AS { COLUMN_STOP }
163- FROM coverage
164- GROUP BY { COLUMN_GENOME_ID } ;
165- CREATE TABLE feature_metadata AS
166- SELECT fc .{ COLUMN_GENOME_ID } ,
162+ SELECT { COLUMN_GENOME_ID } ,
163+ FIRST({ COLUMN_LENGTH } ) AS { COLUMN_LENGTH }
164+ FROM coverage
165+ GROUP BY { COLUMN_GENOME_ID } ;
166+ CREATE TABLE feature_metadata AS
167+ SELECT f .{ COLUMN_GENOME_ID } ,
167168 0::UINTEGER AS { COLUMN_START } ,
168- gl.{ COLUMN_STOP }
169- FROM feature_constraint fc
170- JOIN genome_lengths gl
171- ON fc.{ COLUMN_GENOME_ID } =gl.{ COLUMN_GENOME_ID }
172- """ )
169+ g.{ COLUMN_LENGTH } AS { COLUMN_STOP } ,
170+ g.{ COLUMN_LENGTH }
171+ FROM feature_constraint f
172+ JOIN genome_lengths g
173+ ON f.{ COLUMN_GENOME_ID } =g.{ COLUMN_GENOME_ID }
174+ """ )
173175 else :
174176 # limit the samples considered
175177 self .con .sql (f"""CREATE VIEW coverage AS
@@ -191,17 +193,18 @@ def _load_db(self):
191193 # use the existing length data from coverage to set the start/stop
192194 # positions in feature_metadata
193195 self .con .sql (f"""CREATE VIEW genome_lengths AS
194- SELECT { COLUMN_GENOME_ID } ,
195- FIRST({ COLUMN_LENGTH } ) AS { COLUMN_STOP }
196- FROM coverage
197- GROUP BY { COLUMN_GENOME_ID } ;
198- CREATE TABLE feature_metadata AS
199- SELECT fc .{ COLUMN_GENOME_ID } ,
196+ SELECT { COLUMN_GENOME_ID } ,
197+ FIRST({ COLUMN_LENGTH } ) AS { COLUMN_LENGTH }
198+ FROM coverage
199+ GROUP BY { COLUMN_GENOME_ID } ;
200+ CREATE TABLE feature_metadata AS
201+ SELECT f .{ COLUMN_GENOME_ID } ,
200202 0::UINTEGER AS { COLUMN_START } ,
201- gl.{ COLUMN_STOP }
202- FROM feature_constraint fc
203- JOIN genome_lengths gl
204- ON fc.{ COLUMN_GENOME_ID } =gl.{ COLUMN_GENOME_ID }
203+ g.{ COLUMN_LENGTH } AS { COLUMN_STOP } ,
204+ g.{ COLUMN_LENGTH }
205+ FROM feature_constraint f
206+ JOIN genome_lengths g
207+ ON f.{ COLUMN_GENOME_ID } =g.{ COLUMN_GENOME_ID }
205208 """ )
206209
207210 def metadata (self ):
0 commit comments