Skip to content

Commit 324b470

Merge pull request #35 from biocore/binning-poke
Binning poke
2 parents ed42db1 + c6d6b9b commit 324b470

6 files changed: +134 −175 lines changed


micov/__init__.py

Lines changed: 0 additions & 7 deletions
@@ -1,8 +1 @@
 """micov: microbiome coverage."""
-
-# note: currently for use with duckdb. we cannot easily enforce threads for polars
-# as a specific environment variable must be set prior to the first import. it's
-# doable but will need some engineeering to do it correctly.'And, polars does not
-# currently have a way to limit memory use.
-THREADS = 1
-MEMORY = 8  # gb
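
Note: the THREADS and MEMORY constants removed above are superseded by per-instance arguments on View (see micov/_view.py below). A minimal sketch of the equivalent DuckDB connection configuration, assuming the new defaults of threads=1 and memory="8gb":

import duckdb

# Hedged sketch: mirrors the config keys View.__init__ passes after this change.
threads = 1
memory = "8gb"
con = duckdb.connect(":memory:", config={"threads": threads, "memory_limit": memory})
print(con.sql("SELECT current_setting('threads'), current_setting('memory_limit')"))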

micov/_quant.py

Lines changed: 28 additions & 43 deletions
@@ -3,7 +3,12 @@
 import numpy as np
 import polars as pl

-from ._constants import COLUMN_GENOME_ID, COLUMN_SAMPLE_ID
+from ._constants import (
+    COLUMN_GENOME_ID,
+    COLUMN_SAMPLE_ID,
+    COLUMN_START_DTYPE,
+    COLUMN_STOP_DTYPE,
+)

 warnings.simplefilter("ignore", category=pl.exceptions.PerformanceWarning)

@@ -26,21 +31,23 @@ def create_bin_list(genome_length, bin_num):
         pl.Series("a", [0, genome_length], strict=False)
         .hist(bin_count=bin_num)
         .lazy()
-        .select(pl.col("breakpoint").alias("bin_stop"))
+        .select(pl.col("breakpoint").round().cast(COLUMN_STOP_DTYPE).alias("bin_stop"))
         .with_row_index("bin_idx", offset=1)
     )
     bin_list_pos_start = (
         pl.Series("a", [0, genome_length], strict=False)
         .hist(bin_count=bin_num)
         .lazy()
-        .select(pl.col("breakpoint").alias("bin_start"))
+        .select(
+            pl.col("breakpoint").round().cast(COLUMN_START_DTYPE).alias("bin_start")
+        )
         .with_row_index("bin_idx", offset=2)
     )
     bin_list = (
         bin_list_pos_start.join(bin_list_pos_stop, on="bin_idx", how="right")
         .fill_null(0)
         .select([pl.col("bin_idx"), pl.col("bin_start"), pl.col("bin_stop")])
-        .with_columns(pl.col("bin_idx").cast(pl.Int64))
+        .with_columns(pl.col("bin_idx").cast(COLUMN_START_DTYPE))
     )
     # setting the bin_stop of the last bin to be exactly the genome length + 1
     bin_list = bin_list.with_columns(
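
For reference, a small self-contained sketch of what the updated breakpoint handling in create_bin_list produces, assuming COLUMN_STOP_DTYPE is an unsigned integer dtype (pl.UInt64 stands in for it here); the "breakpoint" column name follows the polars version the diff targets:

import polars as pl

genome_length = 1_000
bin_num = 10

# Histogram breakpoints over [0, genome_length] become the bin_stop values;
# round() plus an integer cast replaces the raw float breakpoints.
bin_stops = (
    pl.Series("a", [0, genome_length], strict=False)
    .hist(bin_count=bin_num)
    .lazy()
    .select(pl.col("breakpoint").round().cast(pl.UInt64).alias("bin_stop"))
    .with_row_index("bin_idx", offset=1)
)
print(bin_stops.collect())  # bin_idx 1..10 with bin_stop 100, 200, ..., 1000
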
@@ -52,51 +59,30 @@ def create_bin_list(genome_length, bin_num):
     return bin_list


-def pos_to_bins(pos, variable, bin_num):
-    genome_length = pos.select("length").limit(1).collect().item()
+def pos_to_bins(pos, variable, bin_num, genome_length):
+    # genome_length = pos.select("length").limit(1).collect().item()
     bin_list = create_bin_list(genome_length, bin_num)

     # get start_bin_idx and stop_bin_idx
     bin_edges = [0.0] + bin_list.select(  # noqa: RUF005
         pl.col("bin_stop")
     ).collect().to_series().to_list()
-    cut_start = (
-        pos.select(pl.col("start"))
-        .collect()
-        .to_series()
-        .cut(
-            bin_edges,
-            labels=np.arange(len(bin_edges) + 1).astype(str),
-            left_closed=True,
-        )
-        .cast(pl.Int64)
-        .alias("start_bin_idx")
-    )
-    cut_stop = (
-        pos.select(pl.col("stop"))
-        .collect()
-        .to_series()
-        .cut(
-            bin_edges,
-            labels=np.arange(len(bin_edges) + 1).astype(str),
-            left_closed=False,
-        )
-        .cast(pl.Int64)
-        .alias("stop_bin_idx")
-    )
-    pos = pos.with_columns([cut_start, cut_stop])
-
-    # update stop_bin_idx +1 for pl.arange and generate range of bins
-    pos = pos.with_columns((pl.col("stop_bin_idx") + 1).alias("stop_bin_idx_add1"))
+    labels = np.arange(len(bin_edges) + 1).astype(str)

-    # generate range of bins covered
-    pos = pos.with_columns(
-        pl.int_ranges("start_bin_idx", "stop_bin_idx_add1").alias("bin_idx")
-    ).drop("stop_bin_idx_add1")
-
-    # generate bin_df
-    df_bins = (
-        pos.explode("bin_idx")
+    return (
+        pos.with_columns(
+            pl.col("start")
+            .cut(bin_edges, labels=labels, left_closed=True)
+            .cast(COLUMN_START_DTYPE)
+            .alias("start_bin_idx"),
+            pl.col("stop")
+            .cut(bin_edges, labels=labels, left_closed=False)
+            .cast(COLUMN_STOP_DTYPE)
+            .alias("stop_bin_idx")
+            + 1,
+        )
+        .with_columns(pl.int_ranges("start_bin_idx", "stop_bin_idx").alias("bin_idx"))
+        .explode("bin_idx")
         .group_by(COLUMN_GENOME_ID, variable, "bin_idx")
         .agg(
             pl.col("start").len().alias("read_hits"),
@@ -106,4 +92,3 @@ def pos_to_bins(pos, variable, bin_num):
         .sort(by=["bin_idx", variable])
         .join(bin_list, how="left", on="bin_idx")
     )
-    return df_bins
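
A minimal, self-contained sketch of the binning idea the rewritten pos_to_bins expresses lazily: each read's [start, stop) interval is expanded into the bin indices it touches via pl.int_ranges plus explode, then aggregated per bin. Bin indices are computed arithmetically here for brevity; the real function derives edges from create_bin_list and assigns bins with Expr.cut, and the plain column names below (genome_id, sample_id) stand in for the repo's COLUMN_* constants.

import polars as pl

genome_length = 1_000
bin_num = 10
bin_size = genome_length / bin_num

reads = pl.LazyFrame(
    {
        "genome_id": ["G1", "G1", "G1"],
        "sample_id": ["S1", "S1", "S2"],
        "start": [5, 250, 990],
        "stop": [120, 260, 1000],
    }
)

per_bin = (
    reads.with_columns(
        # first bin a read touches, and one past the last bin it touches
        (pl.col("start") // bin_size).cast(pl.Int64).alias("start_bin_idx"),
        ((pl.col("stop") - 1) // bin_size).cast(pl.Int64).alias("stop_bin_idx") + 1,
    )
    # expand each read into every bin index it overlaps, then count hits per bin
    .with_columns(pl.int_ranges("start_bin_idx", "stop_bin_idx").alias("bin_idx"))
    .explode("bin_idx")
    .group_by("genome_id", "sample_id", "bin_idx")
    .agg(pl.col("start").len().alias("read_hits"))
    .sort("bin_idx")
)
print(per_bin.collect())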

micov/_view.py

Lines changed: 31 additions & 28 deletions
@@ -3,7 +3,6 @@
 import duckdb
 import polars as pl

-from micov import MEMORY, THREADS
 from micov._constants import (
     COLUMN_COVERED,
     COLUMN_GENOME_ID,
@@ -19,7 +18,9 @@
 class View:
     """View subsets of coverage data."""

-    def __init__(self, dbbase, sample_metadata, features_to_keep):
+    def __init__(
+        self, dbbase, sample_metadata, features_to_keep, threads=1, memory="8gb"
+    ):
         self.dbbase = dbbase
         self.sample_metadata = sample_metadata
         self.features_to_keep = features_to_keep
@@ -28,7 +29,7 @@ def __init__(self, dbbase, sample_metadata, features_to_keep):
         self.constrain_features = False

         self.con = duckdb.connect(
-            ":memory:", config={"threads": THREADS, "memory_limit": f"{MEMORY}gb"}
+            ":memory:", config={"threads": threads, "memory_limit": f"{memory}"}
         )
         self._init()

@@ -106,9 +107,9 @@ def _load_db(self):
         # TODO: replace with duckdb native per sample compression
         # Do we stream to parquet? this could be large
         positions_df = self.con.sql(f"""SELECT * FROM positions
-                             ORDER BY {COLUMN_SAMPLE_ID},
-                                      {COLUMN_GENOME_ID},
-                                      {COLUMN_START}""").pl()
+                                        ORDER BY {COLUMN_SAMPLE_ID},
+                                                 {COLUMN_GENOME_ID},
+                                                 {COLUMN_START}""").pl()

         if len(positions_df) == 0:
             msg = "No positions left after filtering."
@@ -135,7 +136,7 @@ def _load_db(self):
                              SELECT * FROM recomputed_coverage""")

             self.con.sql(f"""CREATE TABLE feature_metadata AS
-                             SELECT *
+                             SELECT *, {COLUMN_STOP} - {COLUMN_START} AS {COLUMN_LENGTH}
                              FROM feature_constraint fc
                              SEMI JOIN coverage cov USING ({COLUMN_GENOME_ID})""")

@@ -158,18 +159,19 @@ def _load_db(self):
                              JOIN metadata md
                                  ON pos.{COLUMN_SAMPLE_ID}=md.{COLUMN_SAMPLE_ID}""")
             self.con.sql(f"""CREATE VIEW genome_lengths AS
-                            SELECT {COLUMN_GENOME_ID},
-                                   FIRST({COLUMN_LENGTH}) AS {COLUMN_STOP}
-                            FROM coverage
-                            GROUP BY {COLUMN_GENOME_ID};
-                            CREATE TABLE feature_metadata AS
-                            SELECT fc.{COLUMN_GENOME_ID},
+                             SELECT {COLUMN_GENOME_ID},
+                                    FIRST({COLUMN_LENGTH}) AS {COLUMN_LENGTH}
+                             FROM coverage
+                             GROUP BY {COLUMN_GENOME_ID};
+                             CREATE TABLE feature_metadata AS
+                             SELECT f.{COLUMN_GENOME_ID},
                                     0::UINTEGER AS {COLUMN_START},
-                                   gl.{COLUMN_STOP}
-                            FROM feature_constraint fc
-                            JOIN genome_lengths gl
-                                ON fc.{COLUMN_GENOME_ID}=gl.{COLUMN_GENOME_ID}
-                            """)
+                                    g.{COLUMN_LENGTH} AS {COLUMN_STOP},
+                                    g.{COLUMN_LENGTH}
+                             FROM feature_constraint f
+                             JOIN genome_lengths g
+                                 ON f.{COLUMN_GENOME_ID}=g.{COLUMN_GENOME_ID}
+                             """)
         else:
             # limit the samples considered
             self.con.sql(f"""CREATE VIEW coverage AS
@@ -191,17 +193,18 @@ def _load_db(self):
             # use the existing length data from coverage to set the start/stop
             # positions in feature_metadata
             self.con.sql(f"""CREATE VIEW genome_lengths AS
-                            SELECT {COLUMN_GENOME_ID},
-                                   FIRST({COLUMN_LENGTH}) AS {COLUMN_STOP}
-                            FROM coverage
-                            GROUP BY {COLUMN_GENOME_ID};
-                            CREATE TABLE feature_metadata AS
-                            SELECT fc.{COLUMN_GENOME_ID},
+                             SELECT {COLUMN_GENOME_ID},
+                                    FIRST({COLUMN_LENGTH}) AS {COLUMN_LENGTH}
+                             FROM coverage
+                             GROUP BY {COLUMN_GENOME_ID};
+                             CREATE TABLE feature_metadata AS
+                             SELECT f.{COLUMN_GENOME_ID},
                                     0::UINTEGER AS {COLUMN_START},
-                                   gl.{COLUMN_STOP}
-                            FROM feature_constraint fc
-                            JOIN genome_lengths gl
-                                ON fc.{COLUMN_GENOME_ID}=gl.{COLUMN_GENOME_ID}
+                                    g.{COLUMN_LENGTH} AS {COLUMN_STOP},
+                                    g.{COLUMN_LENGTH}
+                             FROM feature_constraint f
+                             JOIN genome_lengths g
+                                 ON f.{COLUMN_GENOME_ID}=g.{COLUMN_GENOME_ID}
                              """)

     def metadata(self):
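
A hedged, self-contained illustration of the feature_metadata shape this change produces when lengths come from the coverage table: start fixed at 0, stop set to the genome length, and the length carried as its own column. The table and column names here are plain strings standing in for the repo's COLUMN_* constants, and the toy data is hypothetical.

import duckdb

con = duckdb.connect(":memory:", config={"threads": 1, "memory_limit": "8gb"})
# toy coverage rows: a genome_id may appear many times with the same length
con.sql("""CREATE TABLE coverage AS
           SELECT * FROM (VALUES ('G1', 150, 1000), ('G1', 80, 1000), ('G2', 90, 500))
                    AS t(genome_id, covered, length)""")
con.sql("""CREATE TABLE feature_constraint AS
           SELECT * FROM (VALUES ('G1'), ('G2')) AS t(genome_id)""")
con.sql("""CREATE VIEW genome_lengths AS
               SELECT genome_id,
                      FIRST(length) AS length
               FROM coverage
               GROUP BY genome_id""")
con.sql("""CREATE TABLE feature_metadata AS
               SELECT f.genome_id,
                      0::UINTEGER AS start,
                      g.length AS stop,
                      g.length
               FROM feature_constraint f
               JOIN genome_lengths g
                   ON f.genome_id=g.genome_id""")
print(con.sql("SELECT * FROM feature_metadata"))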
