Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,7 @@ examples/new_test_examples/*

# Ignore new tools examples
examples/new_tools/

# Apple system files
.DS_Store

42 changes: 38 additions & 4 deletions ProtPeptigram/DataProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def remove_ptm(self, peptide: str) -> str:

return clean_peptide

def extract_protein_ids(self, accession_value: str) -> List[str]:
def extract_protein_ids(self, accession_value: str, protein_pattern=None ) -> List[str]:
"""
Extract protein IDs from an accession value, which may contain multiple IDs.

Expand All @@ -191,15 +191,49 @@ def extract_protein_ids(self, accession_value: str) -> List[str]:
"""
if pd.isna(accession_value) or accession_value == "":
return []

if protein_pattern is None:
# Default pattern to split by common delimiters
protein_pattern = r'[,:;|/\s]+'
else:
# Use the provided regex pattern if specified
protein_pattern = re.compile(protein_pattern)
# Split by common delimiters
protein_ids = re.split(r'[,:;|/\s]+', accession_value)
protein_ids = re.split(protein_pattern, accession_value)

# Remove empty entries and trim whitespace
protein_ids = [pid.strip() for pid in protein_ids if pid.strip()]

return protein_ids

def extract_protein_isoforms(self, accession_value: str, protein_pattern: str = r'[,:;|/\s]+') -> List[str]:
    r"""
    Extract protein isoform accessions from an accession value.

    The value is split on ``protein_pattern`` and, for each non-empty piece,
    the leading run of upper-case letters/digits is kept together with an
    optional ``-N`` or ``.N`` isoform suffix. Pieces that do not start with
    such a run are dropped.

    Parameters:
    -----------
    accession_value : str
        Accession value, possibly holding several protein IDs.
    protein_pattern : str, optional
        Regex pattern used to split multiple protein IDs
        (default: r'[,:;|/\s]+', i.e. common delimiters and whitespace).

    Returns:
    --------
    List[str]: List of protein isoforms (e.g., 'P12345-2' or 'P12345.2' if present, else just 'P12345')
    """
    if pd.isna(accession_value) or accession_value == "":
        return []

    # NOTE: the default used to be r'[,:;|/\\s]+'. Inside a raw string that
    # is an escaped backslash followed by a literal "s", so values were split
    # on the letter "s" and never on whitespace. The default now really
    # contains the \s whitespace class, as the documentation always stated.
    # Matches "P12345", "P12345-1" or "P12345.1" at the start of a piece;
    # compiled once instead of once per piece.
    isoform_re = re.compile(r'^([A-Z0-9]+(?:[-\.]\d+)?)')

    isoforms = []
    for pid in re.split(protein_pattern, accession_value):
        pid = pid.strip()
        if not pid:
            continue
        isoform_match = isoform_re.match(pid)
        if isoform_match:
            isoforms.append(isoform_match.group(1))
    return isoforms

def find_peptide_position(self, peptide: str, protein_sequence: str) -> Tuple[int, int]:
"""
Find the start and end positions of a peptide in a protein sequence.
Expand Down
2 changes: 1 addition & 1 deletion ProtPeptigram/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@
# Control __all__ to limit what is imported when using 'from module import *'
# __all__ = ['PeptideDataProcessor', 'ImmunoViz']

__version__ = "1.1.0-dev"
__version__ = "1.1.1-dev"
__author__ = "Sanjay Krishna,Prithvi Munday,Chen Li"
__email__ = "sanjay.sondekoppagopalakrishna@monash.edu"
25 changes: 21 additions & 4 deletions ProtPeptigram/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,26 @@ def run_pipeline(

# Read specific protein list if provided
specific_proteins = None
if protein_list and os.path.exists(protein_list):
specific_proteins = read_protein_list(protein_list)
console.log(f"Using {len(specific_proteins)} proteins from provided list.", style="bold")
# console.log(f"Checking for specific protein list...{protein_list}", style="bold")
if protein_list:
console.log(f"Protein list provided: {protein_list}", style="bold yellow")
try:
if isinstance(protein_list, str):
console.log(f"Checking if protein_list is a file or string: {protein_list}", style="bold yellow")
specific_proteins = [protein_list] # If it's a single string, treat it as a list with one protein
# If protein_list is a string, assume it's a comma-separated list
elif isinstance(protein_list, str) and ',' in protein_list:
console.log("Parsing comma-separated protein list...", style="bold green")
specific_proteins = str(protein_list).split(",")

elif protein_list and os.path.exists(protein_list):
console.log(f"Reading specific protein list from {protein_list}", style="bold green")
specific_proteins = read_protein_list(protein_list)
else:
specific_proteins = read_protein_list(protein_list)
except FileNotFoundError:
console.log(f"Error reading protein list file: {protein_list}", style="bold red")
# raise

# 1. Initialize the data processor
processor = PeptideDataProcessor()
Expand Down Expand Up @@ -184,7 +201,7 @@ def run_pipeline(
group_by='Sample',
color_by='protein',
figsize=(14, 12),
title=f"Peptide-Protein alignment visualisation - {prot}",
title=f"Prot-Petigram :Peptide-Protein alignment visualisation for {prot}",
color_by_protein_and_intensity=False,
intensity_cmaps=["Blues", "Reds", "Greens", "Purples"],
protein_cmap="Set1",
Expand Down
Loading