Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- nf-co2footplot plugin is added [#224](https://github.com/nf-core/variantbenchmarking/pull/224)
- Template update for nf-core/tools v3.4.1 [#235](https://github.com/nf-core/variantbenchmarking/pull/235)
- Adding concordance analysis (pairwise comparison of test VCFs) can be used without peforming benchmarking with truth VCF [#237](https://github.com/nf-core/variantbenchmarking/pull/237)
- Adding support for hap.py, som.py and truvari results to multiqc report. Also custumusing the report better [#249](https://github.com/nf-core/variantbenchmarking/pull/249)
- Adding support for hap.py, som.py and truvari results to multiqc report. Also refactoring the report better [#249](https://github.com/nf-core/variantbenchmarking/pull/249)

### `Fixed`

Expand All @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixing sompy split tag script [#230](https://github.com/nf-core/variantbenchmarking/pull/230)
- Wittyer doesnt support BND type of variants, added better documentation [#231](https://github.com/nf-core/variantbenchmarking/pull/231)
- Fixing the handling of params (when value is 0) from schema_input.json [#234](https://github.com/nf-core/variantbenchmarking/pull/234)
- Fixing and reformatting svlen distribution plot [#250](https://github.com/nf-core/variantbenchmarking/pull/250)

### `Dependencies`

Expand Down
239 changes: 183 additions & 56 deletions bin/plot_svlendist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,21 @@
# A script to parse, analyze, and plot indel or structural variant lenght from VCF and CSV files.

# Copyright 2025 - GHGA
# Author: Kuebra Narci - @kubranarci
# Author: Victor Perez

import matplotlib.pyplot as plt
import os
import argparse
import numpy as np
import gzip
import csv
import pandas as pd
import matplotlib.transforms as mtrans

def get_sample_name(vcf_file):
base_name = os.path.basename(vcf_file)
return base_name.split('.')[0]

def format_bp_label(value, pos):
value = abs(value)
if value >= 1e6:
return f'{value / 1e6:.1f} Mbp'
elif value >= 1e3:
return f'{value / 1e3:.1f} kbp'
elif value > 0:
return f'{int(value)} bp'
else:
return '0'

def parse_vcf_data(vcf_files):
"""
Parses multiple VCF files, extracts SVLEN values, and determines the maximum SV length.
Expand Down Expand Up @@ -173,62 +164,187 @@ def parse_csv_data(csv_files):
return svlen_data, max_svlen


def plot_svlen_distributions(svlen_data, max_svlen, output_file, plot_title):
def format_bp_label(bin_edges,half_open=True,format="sci",decimals=1):
"""
Plots the SVLEN distributions based on the parsed data using a mirrored linear scale and a log y-axis.
Creates a string using the edges of the bins.
Returns:
A string of the form [lef_edge,right_edge]
"""
plt.rcParams.update({
'font.size': 14,
'axes.titlesize': 18,
'axes.labelsize': 16,
'xtick.labelsize': 14,
'ytick.labelsize': 14,
'legend.fontsize': 14,
'legend.title_fontsize': 16,
'figure.titlesize': 20
})

# Create larger figure
plt.figure(figsize=(14, 10))

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
line_styles = ['-', '--', ':', '-.']

num_bins = 50
bins = np.linspace(-max_svlen, max_svlen, num_bins * 2)

for i, (file_name, lengths_dict) in enumerate(svlen_data.items()):
current_color = colors[i % len(colors)]
current_linestyle = line_styles[i % len(line_styles)]
if format=="sci":
format_type="E"
elif format=="raw":
format_type="f"
if half_open:
symbol=")"
else:
symbol="]"

all_lengths = lengths_dict['positive'] + lengths_dict['negative']
label="[{left},{right}{S}".format(left= f"{bin_edges[0]:.{decimals}{format_type}}",
right=f"{bin_edges[1]:.{decimals}{format_type}}",
S=symbol
)
return label

if all_lengths:
counts, bin_edges = np.histogram(all_lengths, bins=bins)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
def default_bins():
"""
Creates a set of bins based on typical orders of magnitue(kilo,Mega,Giga,Tera)
Returns:
A list of bins as described above.

"""
exps=[1,2,3,6,9,12]#bp,kbp,Mbp,Gbp,Tbp
bins_pos=[10**i for i in exps]
bins_neg=[ -val for val in bins_pos ]
bins=sorted(bins_pos+bins_neg+[0])
return bins

def filter_frame(df):
"""
Filters a data frame based on the bin_label categories, if a category is empty for all
samples it will be filtered out.
Returns:
A filtered pandas data frame.

"""
group=df.groupby("bin_label",sort=False)["counts"]
categ=group.sum().index
idx_non_empty=group.sum().values>0
categ_filt=list(categ[idx_non_empty])
df_upt=df.loc[df["bin_label"].isin(categ_filt)]
return df_upt


def data2frame(sv_data,bin_edges="default"):
"""
Takes the sv_data and calculates the histogram of the counts of insertions and deletions
using the list of bin_edges.
Returns:
A pandas dataframe with the information organized for casting a bar plot.

plt.plot(bin_centers, counts, label=file_name,
color=current_color, linestyle=current_linestyle, linewidth=3)
"""

plt.yscale('log')
data_table={
"sample":[],
"bin_left_edge":[],
"bin_right_edge":[],
"bin_label":[],
"type":[],
"counts":[]
}

plt.xlabel("Deletions | Insertions", fontsize=16, fontweight='bold')
plt.ylabel("Count (Log10)", fontsize=16, fontweight='bold')
plt.title(plot_title, fontsize=18, fontweight='bold', pad=20)

legend = plt.legend(title="Tool", title_fontsize=16, fontsize=14)
for handle in legend.legendHandles:
handle.set_linewidth(4.0)
if bin_edges=="default":
bins=default_bins()
elif isinstance(bin_edges, (list, np.ndarray)):
bins=sorted(bin_edges)

plt.grid(True, which="both", linestyle='--', alpha=0.7)
bin_left_edge=bins[:-1]
bin_right_edge=bins[1::]
bin_intervals=list(zip(bin_left_edge,bin_right_edge))

plt.xlim(-max_svlen, max_svlen)
"""
Note: In the following 2 lines the label of the intervals
is formatted to comply with the np.histogram output, i.e.
all bins are considered half-open [) except the right-most bin which is closed
[]. Chek np.histogram documentation.
"""
bin_labels=list ( map( format_bp_label, bin_intervals[0:-1] ) )
bin_labels.append( format_bp_label(bin_intervals[-1],half_open=False) )
mod_type=["insertion" if l>=0 else "deletion" for l,r in bin_intervals ]

plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(format_bp_label))

plt.tight_layout()
for file_name, lengths_dict in sv_data.items():

plt.savefig(output_file, dpi=300, bbox_inches='tight')
all_lengths = lengths_dict['positive'] + lengths_dict['negative']

if all_lengths:
counts, _ = np.histogram(all_lengths, bins=bins)
data_table["sample"].extend([file_name]*len(counts))
data_table["bin_left_edge"].extend(bin_left_edge)
data_table["bin_right_edge"].extend(bin_right_edge)
data_table["bin_label"].extend(bin_labels)
data_table["type"].extend(mod_type)
data_table["counts"].extend(counts)

df_table=pd.DataFrame(data_table)
return df_table

def plot_svlen_distributions(sv_data, bins ,output_file, plot_title):
"""
CreateS a bar plot and writes it in the output_file path
Returns:
A .png figure saved in the output_file path.

"""
df_table=data2frame(sv_data,bin_edges=bins)
df_upt=filter_frame(df_table)

category_label=df_upt["bin_label"].unique()
files=df_upt["sample"].unique()
bar_height={sample: df_upt.loc[ df_upt["sample"]==sample, "counts"].values for sample in files }

types_list=df_upt["type"].unique()

width = 0.11 # the width of the bars
interval_capacity=1/width
interval_load=len(category_label)

if interval_load<interval_capacity:
x = np.arange(len(category_label)) # the label locations
else:
x=(width*interval_load)*np.array(list(range(interval_load)))


if len(types_list)==2:
xlabel_text="Deletions | Insertions"
xlabel_future_action="reposition"
ref_file=df_upt["sample"].unique()[0]
aux=df_upt.loc[df_upt["sample"]==ref_file]
transition_index=(aux["type"].values!="deletion").argmax()
xlabel_position=x[transition_index]

elif len(types_list)==1:
xlabel_future_action=None
if types_list[0]=="insertion":
xlabel_text="Insertions"

elif types_list[0]=="deletion":
xlabel_text="Deletions"

plt.style.use('seaborn-v0_8-colorblind')
fig, ax = plt.subplots(figsize=(14, 10),layout='constrained')
multiplier = 0
pads=[-1,2]
for attribute, measurement in bar_height.items():
offset = width * multiplier
rects = ax.bar(x + offset, measurement, width, label=attribute,alpha=1)
ax.bar_label(rects, padding=pads[multiplier%len(pads)],rotation=30)
multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_title(plot_title,fontsize=18, fontweight='bold', pad=20)
ax.grid(True, which="major", linestyle='--', alpha=0.6)
ax.set_xticks(x + width, category_label,rotation=35,fontsize=14)
ax.set_xlabel(xlabel_text, fontsize=16, fontweight='bold')
if xlabel_future_action=="reposition":
trans = mtrans.blended_transform_factory(ax.transData, ax.transAxes)
ax.xaxis.set_label_coords(xlabel_position, -0.3,transform=trans)

ax.set_yscale('log')
ax.tick_params(axis='y', which='major', labelsize=14)
ax.set_ylabel("Count (Log10)", fontsize=16, fontweight='bold')

ax.legend(title="Tool",
loc='upper right',
title_fontsize=16,
fancybox=True,
shadow=True,
bbox_to_anchor=(1.4, 1),
prop={'size': 14}
)
ax.set_facecolor((0.9,0.9,0.9))
fig.tight_layout()
fig.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"Plot saved to {output_file}")

if __name__ == "__main__":
Expand All @@ -238,6 +354,13 @@ def plot_svlen_distributions(svlen_data, max_svlen, output_file, plot_title):
help="The name of the output image file (default: svlen_distributions.png).")
parser.add_argument('--title', '-t', dest='plot_title', default='Structural Variant Length Distributions by Type',
help="The title for the plot.")
parser.add_argument('--bins', '-b',nargs='+', type=int,default=None,
help="""
List of integer numbers representing the bin_edges for the plot.
If no list is given the default edges are bp,kbp,Mbp,Gbp,Tbp in both
negative(Deletions) and positive(Insertions) directions. The counts of
the histogram are carried out by numpy.histogram.
""")

args = parser.parse_args()

Expand All @@ -260,6 +383,10 @@ def plot_svlen_distributions(svlen_data, max_svlen, output_file, plot_title):
max_len = csv_max_len

if sv_data:
plot_svlen_distributions(sv_data, max_len, args.output_file, args.plot_title)
if args.bins is None:
bin_edges="default"
else:
bin_edges=args.bins
plot_svlen_distributions(sv_data,bin_edges,args.output_file, args.plot_title)
else:
print("No valid input files found to plot.")
2 changes: 0 additions & 2 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

process {

// TODO nf-core: Check the defaults for all processes
cpus = { 1 * task.attempt }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
Expand All @@ -24,7 +23,6 @@ process {
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { 1 }
Expand Down
75 changes: 75 additions & 0 deletions conf/gitpod.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/variantbenchmarking -profile test,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]

withName: 'BCFTOOLS_NORM*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'BCFTOOLS_FILTER*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'BCFTOOLS_SORT*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'BCFTOOLS_REHEADER*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'SVTK_STANDARDIZE' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'SUBTRACT_VCF*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'SURVIVOR_MERGE*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'BCFTOOLS_VIEW*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'PICARD_CREATESEQUENCEDICTIONARY' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'PICARD_LIFTOVERVCF' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
withName: 'BCFTOOLS_RENAME*' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
}
}
Loading