92 changes: 49 additions & 43 deletions alphaquant/config/quant_reader_config.yaml
@@ -144,36 +144,63 @@ maxquant_peptides_leading_razor_protein:

maxquant_evidence:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
sample_ID: Experiment
quant_ID:
precursor: Intensity
protein_cols:
- Gene names
ion_cols:
- Modified sequence
- Charge
ion_hierarchy:
precursor:
order: [SEQ, MOD, CHARGE]
mapping:
SEQ:
- Sequence
MOD:
- Mass
CHARGE:
- Charge
filters:
reverse:
param: Reverse
comparator: "!="
value: "+"
contaminant:
param: Potential contaminant
comparator: "!="
value: "+"
ml_level: SEQ
use_iontree: False
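
The new ion_hierarchy blocks follow the pattern of the other readers in this file: order fixes the hierarchy levels, and mapping ties each level to one or more input columns. A minimal sketch of how such a block can be resolved into an ion identifier (the underscore join is an illustrative assumption, not the alphabase implementation):

import yaml

config = yaml.safe_load("""
ion_hierarchy:
  precursor:
    order: [SEQ, MOD, CHARGE]
    mapping:
      SEQ: [Sequence]
      MOD: [Mass]
      CHARGE: [Charge]
""")

row = {"Sequence": "PEPTIDEK", "Mass": 916.47, "Charge": 2}

# Walk the declared level order and join the mapped column values.
hierarchy = config["ion_hierarchy"]["precursor"]
ion_id = "_".join(
    str(row[column])
    for level in hierarchy["order"]
    for column in hierarchy["mapping"][level]
)
print(ion_id)  # PEPTIDEK_916.47_2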


maxquant_evidence_protein:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
sample_ID: Experiment
quant_ID:
precursor: Intensity
protein_cols:
- Protein group IDs
ion_cols:
- Modified sequence
- Charge


ion_hierarchy:
precursor:
order: [SEQ, MOD, CHARGE]
mapping:
SEQ:
- Sequence
MOD:
- Mass
CHARGE:
- Charge
filters:
reverse:
param: Reverse
comparator: "!="
value: "+"
contaminant:
param: Potential contaminant
comparator: "!="
value: "+"
ml_level: SEQ
use_iontree: False
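
The filters entries share one schema across this config: a column (param), a comparator, and a value. A toy sketch of applying the reverse/contaminant filters above to an evidence table (plain pandas; only the "!=" comparator used here is implemented):

import pandas as pd

filters = {
    "reverse": {"param": "Reverse", "comparator": "!=", "value": "+"},
    "contaminant": {"param": "Potential contaminant", "comparator": "!=", "value": "+"},
}

evidence = pd.DataFrame({
    "Sequence": ["PEPTIDEK", "KEDITPEP", "CONTAPEP"],
    "Reverse": ["", "+", ""],
    "Potential contaminant": ["", "", "+"],
})

# Drop decoy and contaminant rows, i.e. keep rows where param != value.
for rule in filters.values():
    evidence = evidence[evidence[rule["param"]] != rule["value"]]

print(evidence["Sequence"].tolist())  # ['PEPTIDEK']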

maxquant_evidence_proteins_column:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
protein_cols:
- Proteins
ion_cols:
- Sequence
- Modifications
- Charge

diann_precursor_fragion_ms1:
format: longtable
@@ -1261,27 +1288,6 @@ diaumpire_precursor_ms1:
- Peptide Key


diann_wideformat:
format: widetable
protein_cols:
- Protein.Group
ion_cols:
- Stripped.Sequence
- Modified.Sequence
- Precursor.Charge
ion_hierarchy:
sequence_int:
order: [SEQ, MOD]
mapping:
SEQ:
- Stripped.Sequence
MOD:
- Modified.Sequence
CH:
- Precursor.Charge
ml_level: SEQ
use_iontree: False

fragpipe_precursors:
format: widetable
quant_pre_or_suffix: " Intensity"
35 changes: 19 additions & 16 deletions alphaquant/diffquant/condpair_analysis.py
@@ -49,7 +49,7 @@ def analyze_condpair(*,runconfig, condpair):
write_out_normed_df(df_c1_normed, df_c2_normed, pep2prot, runconfig.results_dir, condpair)
normed_c1 = aqbg.ConditionBackgrounds(df_c1_normed, p2z)
normed_c2 = aqbg.ConditionBackgrounds(df_c2_normed, p2z)

ions_to_check = normed_c1.ion2nonNanvals.keys() & normed_c2.ion2nonNanvals.keys()
ions_to_check = sorted(ions_to_check)

@@ -82,16 +82,16 @@ def analyze_condpair(*,runconfig, condpair):
ions = prot2diffions.get(prot)
if len(ions)<runconfig.min_num_ions:
continue
clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,

clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,
take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge)
protnodes.append(clustered_prot_node)

if count_prots%100==0:
LOGGER.info(f"checked {count_prots} of {len(prot2diffions.keys())} prots")
count_prots+=1

if len(prot2missingval_diffions.keys())>0:
LOGGER.info(f"start analysis of proteins w. completely missing values")

@@ -102,13 +102,13 @@ def analyze_condpair(*,runconfig, condpair):
ions = prot2missingval_diffions.get(prot)
protnode_missingval = aq_clust_missingval.create_protnode_from_missingval_ions(gene_name=prot,diffions=ions, normed_c1=normed_c1, normed_c2=normed_c2)
protnodes_missingval.append(protnode_missingval)

LOGGER.info(f"finished missing value analysis")

if runconfig.use_ml:
ml_performance_dict = {}

#aq_class_stacked_frag.assign_predictability_scores_stacked(protein_nodes= protnodes, acquisition_info_df=None,results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair)+"_fragions",
#aq_class_stacked_frag.assign_predictability_scores_stacked(protein_nodes= protnodes, acquisition_info_df=None,results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair)+"_fragions",
# min_num_fragions=5, replace_nans=True, performance_metrics=ml_performance_dict, plot_predictor_performance=True)
ml_successfull =aq_class_precursors.assign_predictability_scores(protein_nodes= protnodes, results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair), ml_info_file=runconfig.ml_input_file,
samples_used =c1_samples + c2_samples, min_num_precursors=3, prot_fc_cutoff=0, replace_nans=True, performance_metrics=ml_performance_dict, plot_predictor_performance=runconfig.runtime_plots)
@@ -159,10 +159,10 @@ def get_per_condition_dataframes(samples_c1, samples_c2, unnormed_df, minrep_bot

if min_samples<2:
raise Exception(f"condpair has not enough samples: c1:{len(samples_c1)} c2: {len(samples_c2)}, skipping")

if (minrep_either is not None) or ((minrep_c1 is not None) and (minrep_c2 is not None)): #minrep_both was set as default and should be overruled by minrep_either or minrep_c1 and minrep_c2
minrep_both = None

if minrep_either is not None:
minrep_either = np.min([get_minrep_for_cond(samples_c1, minrep_either), get_minrep_for_cond(samples_c2, minrep_either)])
passes_minrep_c1 = unnormed_df.loc[:, samples_c1].notna().sum(axis=1) >= minrep_either
@@ -184,7 +184,7 @@
df_c2 = unnormed_df.loc[:, samples_c2].dropna(thresh=minrep_c2, axis=0)
if (len(df_c1.index)<5) | (len(df_c2.index)<5):
raise Exception(f"condpair has not enough data for processing c1: {len(df_c1.index)} c2: {len(df_c2.index)}, skipping")

if (minrep_both is None) and (minrep_either is None) and (minrep_c1 is None) and (minrep_c2 is None):
raise Exception("no minrep set, please specify!")

@@ -200,13 +200,13 @@ def get_minrep_for_cond(c_samples, minrep):
return num_samples
else:
return minrep






def write_out_tables(condpair_node, runconfig):
condpair = condpair_node.name

res_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "gene", min_num_peptides = runconfig.minpep, annotation_file= getattr(runconfig, "annotation_file", None)).results_df
has_sequence_nodes = check_if_has_sequence_nodes(condpair_node)
if has_sequence_nodes:
@@ -235,14 +235,17 @@ def write_out_tables(condpair_node, runconfig):
res_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.tsv", sep = "\t", index=None)
if has_sequence_nodes:
pep_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.seq.tsv", sep = "\t", index=None)

if has_precursor_nodes:
prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)

return res_df, pep_df

def check_if_has_sequence_nodes(condpair_node):
return condpair_node.children[0].children[0].type == "seq"

def check_if_has_precursor_nodes(condpair_node):
return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
try:
return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
except (IndexError, AttributeError):
return False
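
The guarded depth probe now returns False for trees that never reach mod_seq_charge depth instead of raising. A self-contained sketch of that behavior on a hand-built tree (the Node class is a stand-in for the real tree objects):

class Node:
    def __init__(self, type_, children=()):
        self.type = type_
        self.children = list(children)

def check_if_has_precursor_nodes(condpair_node):
    try:
        return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
    except (IndexError, AttributeError):
        return False

# condpair -> gene -> seq -> mod_seq -> mod_seq_charge: the probe succeeds.
deep = Node("condpair", [Node("gene", [Node("seq", [Node("mod_seq", [Node("mod_seq_charge")])])])])
# condpair -> gene -> seq: children[0] raises IndexError, the guard returns False.
shallow = Node("condpair", [Node("gene", [Node("seq")])])

print(check_if_has_precursor_nodes(deep), check_if_has_precursor_nodes(shallow))  # True False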
15 changes: 11 additions & 4 deletions alphaquant/run_pipeline.py
@@ -122,22 +122,26 @@ def run_pipeline(input_file: str,
if samplemap_df is None:
samplemap_df = aq_diffquant_utils.load_samplemap(samplemap_file)

input_type, _, _ = config_dict_loader.get_input_type_and_config_dict(input_file_original, input_type_to_use)
input_type, config_dict, _ = config_dict_loader.get_input_type_and_config_dict(input_file_original, input_type_to_use)
annotation_file = load_annotation_file(input_file_original, input_type, annotation_columns)
use_ml = check_if_table_supports_ml(config_dict)

if perform_ptm_mapping:
if modification_type is None:
raise Exception("modification_type is None, but perform_ptm_mapping is True. Please set perform_ptm_mapping to False or specify modification_type.")
input_file_reformat = load_ptm_input_file(input_file = input_file_original, input_type_to_use = "spectronaut_ptm_fragion", results_dir = results_dir, samplemap_df = samplemap_df, modification_type = modification_type, organism = organism)
ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)
if use_ml:
ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)

elif "fragment_precursorfiltered.matrix" in input_file_original:
alphadia_tableprocessor = aq_table_alphadiareader.AlphaDIAFragTableProcessor(input_file_original)
input_file_reformat = alphadia_tableprocessor.input_file_reformat
ml_input_file = alphadia_tableprocessor.ml_info_file
if use_ml:
ml_input_file = alphadia_tableprocessor.ml_info_file
else:
input_file_reformat = load_input_file(input_file_original, input_type)
ml_input_file = load_ml_info_file(input_file_original, input_type)
if use_ml:
ml_input_file = load_ml_info_file(input_file_original, input_type)

if peptides_to_exclude_file is not None:
remove_peptides_to_exclude_from_input_file(input_file_reformat, peptides_to_exclude_file)
@@ -233,6 +237,9 @@ def load_annotation_file(input_file, input_type, annotation_columns):
else:
return aq_tablewriter_misc.AnnotationFileCreator(input_file, input_type, annotation_columns).annotation_filename

def check_if_table_supports_ml(config_dict):
return config_dict["format"] == "longtable"

def load_ml_info_file(input_file, input_type, modification_type = None):
ml_info_filename = aq_utils.get_progress_folder_filename(input_file, f".ml_info_table.tsv")
if os.path.exists(ml_info_filename):#in case there already is a reformatted file, we don't need to reformat it again
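
check_if_table_supports_ml now gates every load_ml_info_file call, so wide-format tables skip ML input generation. A minimal sketch of that control flow (the None default and the helper are assumptions for illustration; the diff does not show how ml_input_file is initialized when use_ml is False):

def check_if_table_supports_ml(config_dict):
    return config_dict["format"] == "longtable"

def resolve_ml_input_file(config_dict, loader):
    # Hypothetical helper: only long-format tables get an ML info file.
    if check_if_table_supports_ml(config_dict):
        return loader()
    return None

print(resolve_ml_input_file({"format": "widetable"}, lambda: "x.ml_info_table.tsv"))  # None
print(resolve_ml_input_file({"format": "longtable"}, lambda: "x.ml_info_table.tsv"))  # x.ml_info_table.tsv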
38 changes: 32 additions & 6 deletions alphaquant/ui/dashboard_parts_run_pipeline.py
@@ -18,9 +18,10 @@
import alphaquant.ui.dashboard_parts_plots_proteoforms as dashboad_parts_plots_proteoforms
import alphaquant.ui.gui as gui
import alphaquant.ui.gui_textfields as gui_textfields
import alphaquant.utils.reader_utils as aq_reader_utils

import alphabase.quantification.quant_reader.config_dict_loader as config_dict_loader
config_dict_loader.INTABLE_CONFIG = os.path.join(pathlib.Path(__file__).parent.absolute(), "../config/quant_reader_config_lightweight.yaml")
config_dict_loader.INTABLE_CONFIG = os.path.join(pathlib.Path(__file__).parent.absolute(), "../config/quant_reader_config.yaml")
# If using Plotly in Panel
pn.extension('plotly')

@@ -886,13 +887,33 @@ def _import_sample_names(self):

input_file = self.path_analysis_file.value
_, config_dict, sep = config_dict_loader.get_input_type_and_config_dict(input_file)
sample_column = config_dict["sample_ID"]
sample_names = set()
for chunk in pd.read_csv(input_file, sep=sep, usecols=[sample_column], chunksize=400000):
sample_names.update(chunk[sample_column].unique())
self.sample_names = sample_names
if config_dict["format"] == "longtable":
sample_column = config_dict["sample_ID"]
sample_names = set()

for chunk in aq_reader_utils.read_file(input_file, sep=sep, usecols=[sample_column], chunksize=400000):
sample_names.update(chunk[sample_column].unique())
self.sample_names = sample_names
elif config_dict["format"] == "widetable":
# Read the headers first to identify sample columns
headers = aq_reader_utils.read_file(input_file, sep=sep, nrows=0).columns.tolist()

quant_pre_or_suffix = config_dict.get("quant_pre_or_suffix")
# Filter headers to find those with the prefix or suffix
sample_columns = [
col for col in headers if (
col.startswith(quant_pre_or_suffix) or
col.endswith(quant_pre_or_suffix)
)
]
self.sample_names = set([col.replace(quant_pre_or_suffix, '') for col in sample_columns])
else:
print("ERROR: Could not idenfity sample names in the input file.")
self.run_pipeline_error.object = "Could not idenfity sample names . Please check your input file."
self.run_pipeline_error.visible = True

except Exception as e:
print(f"Error importing data: {e}")
self.run_pipeline_error.object = f"Error importing data: {e}"
self.run_pipeline_error.visible = True
finally:
@@ -1127,6 +1148,7 @@ def _update_samplemap(self, event):
self.state.notify_subscribers('samplemap_df')

except Exception as e:
print(f"Error reading sample map: {str(e)}")
self.run_pipeline_error.object = f"Error reading sample map: {str(e)}"
self.run_pipeline_error.visible = True

@@ -1156,6 +1178,10 @@ def _generate_samplemap(self, event):
self.template_success_message.object = f"""Template has been generated. Please fill out the condition column in the table below.\nThe template has also been saved to
<code>{template_path}</code>\nif you prefer to edit it with Excel or other applications."""
self.template_success_message.visible = True
except Exception as e:
print(f"Error generating sample map: {str(e)}")
self.run_pipeline_error.object = f"Error generating sample map: {str(e)}"
self.run_pipeline_error.visible = True
finally:
# Hide loading indicators when done
self.loading_samples_indicator.visible = False
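
For wide tables the sample names are now recovered from the header row alone: read zero data rows, keep the columns carrying the configured quant prefix or suffix, and strip it off. A standalone sketch of that header parsing (plain pandas; the " Intensity" suffix matches the fragpipe_precursors entry in the YAML above):

import io
import pandas as pd

tsv = "Protein\tsample_A Intensity\tsample_B Intensity\nP1\t100\t200\n"
headers = pd.read_csv(io.StringIO(tsv), sep="\t", nrows=0).columns.tolist()

quant_pre_or_suffix = " Intensity"
sample_columns = [
    col for col in headers
    if col.startswith(quant_pre_or_suffix) or col.endswith(quant_pre_or_suffix)
]
sample_names = {col.replace(quant_pre_or_suffix, "") for col in sample_columns}
print(sorted(sample_names))  # ['sample_A', 'sample_B']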
5 changes: 4 additions & 1 deletion alphaquant/utils/reader_utils.py
@@ -4,9 +4,11 @@
LOGGER = logging.getLogger(__name__)


def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None):
def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None, nrows=None):
file_path = str(file_path)
if ".parquet" in file_path:
if nrows is not None:
LOGGER.warning(f"nrows parameter is set, but not supported for parquet files. Ignoring nrows parameter.")
return _read_parquet_file(file_path, usecols=usecols, chunksize=chunksize)
else:
if sep is None:
@@ -26,6 +28,7 @@ def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None):
usecols=usecols,
encoding="latin1",
chunksize=chunksize,
nrows=nrows,
)


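
A short usage sketch of the extended read_file above (file names hypothetical): nrows reaches pandas only on the CSV/TSV path, while the parquet path logs the warning and reads the full file:

# Header-only read of a TSV: an empty frame whose columns are the header row.
headers = read_file("results.tsv", sep="\t", nrows=0).columns.tolist()

# The same call on parquet ignores nrows after logging the warning.
full_df = read_file("results.parquet", nrows=0)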