92 changes: 49 additions & 43 deletions alphaquant/config/quant_reader_config.yaml
@@ -144,36 +144,63 @@ maxquant_peptides_leading_razor_protein:

maxquant_evidence:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
sample_ID: Experiment
quant_ID:
precursor: Intensity
protein_cols:
- Gene names
ion_cols:
- Modified sequence
- Charge
ion_hierarchy:
precursor:
order: [SEQ, MOD, CHARGE]
mapping:
SEQ:
- Sequence
MOD:
- Mass
CHARGE:
- Charge
filters:
reverse:
param: Reverse
comparator: "!="
value: "+"
contaminant:
param: Potential contaminant
comparator: "!="
value: "+"
ml_level: SEQ
use_iontree: False
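
The new ion_hierarchy blocks follow the pattern of the other readers in this file: order fixes the hierarchy levels, and mapping ties each level to one or more input columns. A minimal sketch of how such a block can be resolved into an ion identifier (the underscore join is an illustrative assumption, not the alphabase implementation):

import yaml

config = yaml.safe_load("""
ion_hierarchy:
  precursor:
    order: [SEQ, MOD, CHARGE]
    mapping:
      SEQ: [Sequence]
      MOD: [Mass]
      CHARGE: [Charge]
""")

row = {"Sequence": "PEPTIDEK", "Mass": 916.47, "Charge": 2}

# Walk the declared level order and join the mapped column values.
hierarchy = config["ion_hierarchy"]["precursor"]
ion_id = "_".join(
    str(row[column])
    for level in hierarchy["order"]
    for column in hierarchy["mapping"][level]
)
print(ion_id)  # PEPTIDEK_916.47_2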


maxquant_evidence_protein:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
sample_ID: Experiment
quant_ID:
precursor: Intensity
protein_cols:
- Protein group IDs
ion_cols:
- Modified sequence
- Charge


ion_hierarchy:
precursor:
order: [SEQ, MOD, CHARGE]
mapping:
SEQ:
- Sequence
MOD:
- Mass
CHARGE:
- Charge
filters:
reverse:
param: Reverse
comparator: "!="
value: "+"
contaminant:
param: Potential contaminant
comparator: "!="
value: "+"
ml_level: SEQ
use_iontree: False
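
The filters entries share one schema across this config: a column (param), a comparator, and a value. A toy sketch of applying the reverse/contaminant filters above to an evidence table (plain pandas; only the "!=" comparator used here is implemented):

import pandas as pd

filters = {
    "reverse": {"param": "Reverse", "comparator": "!=", "value": "+"},
    "contaminant": {"param": "Potential contaminant", "comparator": "!=", "value": "+"},
}

evidence = pd.DataFrame({
    "Sequence": ["PEPTIDEK", "KEDITPEP", "CONTAPEP"],
    "Reverse": ["", "+", ""],
    "Potential contaminant": ["", "", "+"],
})

# Drop decoy and contaminant rows, i.e. keep rows where param != value.
for rule in filters.values():
    evidence = evidence[evidence[rule["param"]] != rule["value"]]

print(evidence["Sequence"].tolist())  # ['PEPTIDEK']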

maxquant_evidence_proteins_column:
format: longtable
sample_ID: Experiment #Raw file
quant_ID: Intensity
protein_cols:
- Proteins
ion_cols:
- Sequence
- Modifications
- Charge

diann_precursor_fragion_ms1:
format: longtable
@@ -1261,27 +1288,6 @@ diaumpire_precursor_ms1:
- Peptide Key


diann_wideformat:
format: widetable
protein_cols:
- Protein.Group
ion_cols:
- Stripped.Sequence
- Modified.Sequence
- Precursor.Charge
ion_hierarchy:
sequence_int:
order: [SEQ, MOD]
mapping:
SEQ:
- Stripped.Sequence
MOD:
- Modified.Sequence
CH:
- Precursor.Charge
ml_level: SEQ
use_iontree: False

fragpipe_precursors:
format: widetable
quant_pre_or_suffix: " Intensity"
35 changes: 19 additions & 16 deletions alphaquant/diffquant/condpair_analysis.py
@@ -49,7 +49,7 @@ def analyze_condpair(*,runconfig, condpair):
write_out_normed_df(df_c1_normed, df_c2_normed, pep2prot, runconfig.results_dir, condpair)
normed_c1 = aqbg.ConditionBackgrounds(df_c1_normed, p2z)
normed_c2 = aqbg.ConditionBackgrounds(df_c2_normed, p2z)

ions_to_check = normed_c1.ion2nonNanvals.keys() & normed_c2.ion2nonNanvals.keys()
ions_to_check = sorted(ions_to_check)

@@ -82,16 +82,16 @@ def analyze_condpair(*,runconfig, condpair):
ions = prot2diffions.get(prot)
if len(ions)<runconfig.min_num_ions:
continue
clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,

clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,
take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge)
protnodes.append(clustered_prot_node)

if count_prots%100==0:
LOGGER.info(f"checked {count_prots} of {len(prot2diffions.keys())} prots")
count_prots+=1

if len(prot2missingval_diffions.keys())>0:
LOGGER.info(f"start analysis of proteins w. completely missing values")

@@ -102,13 +102,13 @@ def analyze_condpair(*,runconfig, condpair):
ions = prot2missingval_diffions.get(prot)
protnode_missingval = aq_clust_missingval.create_protnode_from_missingval_ions(gene_name=prot,diffions=ions, normed_c1=normed_c1, normed_c2=normed_c2)
protnodes_missingval.append(protnode_missingval)

LOGGER.info(f"finished missing value analysis")

if runconfig.use_ml:
ml_performance_dict = {}

#aq_class_stacked_frag.assign_predictability_scores_stacked(protein_nodes= protnodes, acquisition_info_df=None,results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair)+"_fragions",
#aq_class_stacked_frag.assign_predictability_scores_stacked(protein_nodes= protnodes, acquisition_info_df=None,results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair)+"_fragions",
# min_num_fragions=5, replace_nans=True, performance_metrics=ml_performance_dict, plot_predictor_performance=True)
ml_successfull =aq_class_precursors.assign_predictability_scores(protein_nodes= protnodes, results_dir=runconfig.results_dir, name = aqutils.get_condpairname(condpair), ml_info_file=runconfig.ml_input_file,
samples_used =c1_samples + c2_samples, min_num_precursors=3, prot_fc_cutoff=0, replace_nans=True, performance_metrics=ml_performance_dict, plot_predictor_performance=runconfig.runtime_plots)
@@ -159,10 +159,10 @@ def get_per_condition_dataframes(samples_c1, samples_c2, unnormed_df, minrep_bot

if min_samples<2:
raise Exception(f"condpair has not enough samples: c1:{len(samples_c1)} c2: {len(samples_c2)}, skipping")

if (minrep_either is not None) or ((minrep_c1 is not None) and (minrep_c2 is not None)): #minrep_both was set as default and should be overruled by minrep_either or minrep_c1 and minrep_c2
minrep_both = None

if minrep_either is not None:
minrep_either = np.min([get_minrep_for_cond(samples_c1, minrep_either), get_minrep_for_cond(samples_c2, minrep_either)])
passes_minrep_c1 = unnormed_df.loc[:, samples_c1].notna().sum(axis=1) >= minrep_either
@@ -184,7 +184,7 @@
df_c2 = unnormed_df.loc[:, samples_c2].dropna(thresh=minrep_c2, axis=0)
if (len(df_c1.index)<5) | (len(df_c2.index)<5):
raise Exception(f"condpair has not enough data for processing c1: {len(df_c1.index)} c2: {len(df_c2.index)}, skipping")

if (minrep_both is None) and (minrep_either is None) and (minrep_c1 is None) and (minrep_c2 is None):
raise Exception("no minrep set, please specify!")

@@ -200,13 +200,13 @@ def get_minrep_for_cond(c_samples, minrep):
return num_samples
else:
return minrep






def write_out_tables(condpair_node, runconfig):
condpair = condpair_node.name

res_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "gene", min_num_peptides = runconfig.minpep, annotation_file= getattr(runconfig, "annotation_file", None)).results_df
has_sequence_nodes = check_if_has_sequence_nodes(condpair_node)
if has_sequence_nodes:
@@ -235,14 +235,17 @@ def write_out_tables(condpair_node, runconfig):
res_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.tsv", sep = "\t", index=None)
if has_sequence_nodes:
pep_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.seq.tsv", sep = "\t", index=None)

if has_precursor_nodes:
prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)

return res_df, pep_df

def check_if_has_sequence_nodes(condpair_node):
return condpair_node.children[0].children[0].type == "seq"

def check_if_has_precursor_nodes(condpair_node):
return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
try:
return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
except (IndexError, AttributeError):
return False
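
The guarded depth probe now returns False for trees that never reach mod_seq_charge depth instead of raising. A self-contained sketch of that behavior on a hand-built tree (the Node class is a stand-in for the real tree objects):

class Node:
    def __init__(self, type_, children=()):
        self.type = type_
        self.children = list(children)

def check_if_has_precursor_nodes(condpair_node):
    try:
        return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
    except (IndexError, AttributeError):
        return False

# condpair -> gene -> seq -> mod_seq -> mod_seq_charge: the probe succeeds.
deep = Node("condpair", [Node("gene", [Node("seq", [Node("mod_seq", [Node("mod_seq_charge")])])])])
# condpair -> gene -> seq: children[0] raises IndexError, the guard returns False.
shallow = Node("condpair", [Node("gene", [Node("seq")])])

print(check_if_has_precursor_nodes(deep), check_if_has_precursor_nodes(shallow))  # True False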
15 changes: 11 additions & 4 deletions alphaquant/run_pipeline.py
@@ -122,22 +122,26 @@ def run_pipeline(input_file: str,
if samplemap_df is None:
samplemap_df = aq_diffquant_utils.load_samplemap(samplemap_file)

input_type, _, _ = config_dict_loader.get_input_type_and_config_dict(input_file_original, input_type_to_use)
input_type, config_dict, _ = config_dict_loader.get_input_type_and_config_dict(input_file_original, input_type_to_use)
annotation_file = load_annotation_file(input_file_original, input_type, annotation_columns)
use_ml = check_if_table_supports_ml(config_dict)

if perform_ptm_mapping:
if modification_type is None:
raise Exception("modification_type is None, but perform_ptm_mapping is True. Please set perform_ptm_mapping to False or specify modification_type.")
input_file_reformat = load_ptm_input_file(input_file = input_file_original, input_type_to_use = "spectronaut_ptm_fragion", results_dir = results_dir, samplemap_df = samplemap_df, modification_type = modification_type, organism = organism)
ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)
if use_ml:
ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)

elif "fragment_precursorfiltered.matrix" in input_file_original:
alphadia_tableprocessor = aq_table_alphadiareader.AlphaDIAFragTableProcessor(input_file_original)
input_file_reformat = alphadia_tableprocessor.input_file_reformat
ml_input_file = alphadia_tableprocessor.ml_info_file
if use_ml:
ml_input_file = alphadia_tableprocessor.ml_info_file
else:
input_file_reformat = load_input_file(input_file_original, input_type)
ml_input_file = load_ml_info_file(input_file_original, input_type)
if use_ml:
ml_input_file = load_ml_info_file(input_file_original, input_type)

if peptides_to_exclude_file is not None:
remove_peptides_to_exclude_from_input_file(input_file_reformat, peptides_to_exclude_file)
@@ -233,6 +237,9 @@ def load_annotation_file(input_file, input_type, annotation_columns):
else:
return aq_tablewriter_misc.AnnotationFileCreator(input_file, input_type, annotation_columns).annotation_filename

def check_if_table_supports_ml(config_dict):
return config_dict["format"] == "longtable"

def load_ml_info_file(input_file, input_type, modification_type = None):
ml_info_filename = aq_utils.get_progress_folder_filename(input_file, f".ml_info_table.tsv")
if os.path.exists(ml_info_filename):#in case there already is a reformatted file, we don't need to reformat it again
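
check_if_table_supports_ml now gates every load_ml_info_file call, so wide-format tables skip ML input generation. A minimal sketch of that control flow (the None default and the helper are assumptions for illustration; the diff does not show how ml_input_file is initialized when use_ml is False):

def check_if_table_supports_ml(config_dict):
    return config_dict["format"] == "longtable"

def resolve_ml_input_file(config_dict, loader):
    # Hypothetical helper: only long-format tables get an ML info file.
    if check_if_table_supports_ml(config_dict):
        return loader()
    return None

print(resolve_ml_input_file({"format": "widetable"}, lambda: "x.ml_info_table.tsv"))  # None
print(resolve_ml_input_file({"format": "longtable"}, lambda: "x.ml_info_table.tsv"))  # x.ml_info_table.tsv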
38 changes: 32 additions & 6 deletions alphaquant/ui/dashboard_parts_run_pipeline.py
@@ -18,9 +18,10 @@
import alphaquant.ui.dashboard_parts_plots_proteoforms as dashboad_parts_plots_proteoforms
import alphaquant.ui.gui as gui
import alphaquant.ui.gui_textfields as gui_textfields
import alphaquant.utils.reader_utils as aq_reader_utils

import alphabase.quantification.quant_reader.config_dict_loader as config_dict_loader
config_dict_loader.INTABLE_CONFIG = os.path.join(pathlib.Path(__file__).parent.absolute(), "../config/quant_reader_config_lightweight.yaml")
config_dict_loader.INTABLE_CONFIG = os.path.join(pathlib.Path(__file__).parent.absolute(), "../config/quant_reader_config.yaml")
# If using Plotly in Panel
pn.extension('plotly')

@@ -886,13 +887,33 @@ def _import_sample_names(self):

input_file = self.path_analysis_file.value
_, config_dict, sep = config_dict_loader.get_input_type_and_config_dict(input_file)
sample_column = config_dict["sample_ID"]
sample_names = set()
for chunk in pd.read_csv(input_file, sep=sep, usecols=[sample_column], chunksize=400000):
sample_names.update(chunk[sample_column].unique())
self.sample_names = sample_names
if config_dict["format"] == "longtable":
sample_column = config_dict["sample_ID"]
sample_names = set()

for chunk in aq_reader_utils.read_file(input_file, sep=sep, usecols=[sample_column], chunksize=400000):
sample_names.update(chunk[sample_column].unique())
self.sample_names = sample_names
elif config_dict["format"] == "widetable":
# Read the headers first to identify sample columns
headers = aq_reader_utils.read_file(input_file, sep=sep, nrows=0).columns.tolist()

quant_pre_or_suffix = config_dict.get("quant_pre_or_suffix")
# Filter headers to find those with the prefix or suffix
sample_columns = [
col for col in headers if (
col.startswith(quant_pre_or_suffix) or
col.endswith(quant_pre_or_suffix)
)
]
self.sample_names = set([col.replace(quant_pre_or_suffix, '') for col in sample_columns])
else:
print("ERROR: Could not idenfity sample names in the input file.")
self.run_pipeline_error.object = "Could not idenfity sample names . Please check your input file."
self.run_pipeline_error.visible = True

except Exception as e:
print(f"Error importing data: {e}")
self.run_pipeline_error.object = f"Error importing data: {e}"
self.run_pipeline_error.visible = True
finally:
@@ -1127,6 +1148,7 @@ def _update_samplemap(self, event):
self.state.notify_subscribers('samplemap_df')

except Exception as e:
print(f"Error reading sample map: {str(e)}")
self.run_pipeline_error.object = f"Error reading sample map: {str(e)}"
self.run_pipeline_error.visible = True

@@ -1156,6 +1178,10 @@ def _generate_samplemap(self, event):
self.template_success_message.object = f"""Template has been generated. Please fill out the condition column in the table below.\nThe template has also been saved to
<code>{template_path}</code>\nif you prefer to edit it with Excel or other applications."""
self.template_success_message.visible = True
except Exception as e:
print(f"Error generating sample map: {str(e)}")
self.run_pipeline_error.object = f"Error generating sample map: {str(e)}"
self.run_pipeline_error.visible = True
finally:
# Hide loading indicators when done
self.loading_samples_indicator.visible = False
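
For wide tables the sample names are now recovered from the header row alone: read zero data rows, keep the columns carrying the configured quant prefix or suffix, and strip it off. A standalone sketch of that header parsing (plain pandas; the " Intensity" suffix matches the fragpipe_precursors entry in the YAML above):

import io
import pandas as pd

tsv = "Protein\tsample_A Intensity\tsample_B Intensity\nP1\t100\t200\n"
headers = pd.read_csv(io.StringIO(tsv), sep="\t", nrows=0).columns.tolist()

quant_pre_or_suffix = " Intensity"
sample_columns = [
    col for col in headers
    if col.startswith(quant_pre_or_suffix) or col.endswith(quant_pre_or_suffix)
]
sample_names = {col.replace(quant_pre_or_suffix, "") for col in sample_columns}
print(sorted(sample_names))  # ['sample_A', 'sample_B']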
5 changes: 4 additions & 1 deletion alphaquant/utils/reader_utils.py
@@ -4,9 +4,11 @@
LOGGER = logging.getLogger(__name__)


def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None):
def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None, nrows=None):
file_path = str(file_path)
if ".parquet" in file_path:
if nrows is not None:
LOGGER.warning(f"nrows parameter is set, but not supported for parquet files. Ignoring nrows parameter.")
return _read_parquet_file(file_path, usecols=usecols, chunksize=chunksize)
else:
if sep is None:
@@ -26,6 +28,7 @@ def read_file(file_path, decimal=".", usecols=None, chunksize=None, sep=None):
usecols=usecols,
encoding="latin1",
chunksize=chunksize,
nrows=nrows,
)


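
A short usage sketch of the extended read_file above (file names hypothetical): nrows reaches pandas only on the CSV/TSV path, while the parquet path logs the warning and reads the full file:

# Header-only read of a TSV: an empty frame whose columns are the header row.
headers = read_file("results.tsv", sep="\t", nrows=0).columns.tolist()

# The same call on parquet ignores nrows after logging the warning.
full_df = read_file("results.parquet", nrows=0)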