Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bedboss/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.9.1"
__version__ = "0.9.2"
11 changes: 10 additions & 1 deletion bedboss/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
)
from bedboss.bedboss import run_all
from bedboss.bedbuncher.bedbuncher import run_bedbuncher
from bedboss.const import MAX_FILE_SIZE
from bedboss.exceptions import BedBossException, QualityException
from bedboss.skipper import Skipper
from bedboss.refgenome_validator.main import ReferenceValidator
Expand Down Expand Up @@ -693,11 +694,19 @@ def _upload_gse(
)

# to speed up the process, we can run initial QC on the file
run_initial_qc(project_sample.file_url)
qc_file_size = run_initial_qc(project_sample.file_url)
if qc_file_size > 0:
sample_status.file_size = min(
qc_file_size, MAX_FILE_SIZE
) # we need to limit file size to MAX_FILE_SIZE for DB storage
except QualityException as err:
_LOGGER.error(f"Processing of '{sample_gsm}' failed with error: {str(err)}")
sample_status.status = STATUS.FAIL
sample_status.error = str(err)
if err.file_size > 0:
sample_status.file_size = min(
err.file_size, MAX_FILE_SIZE
) # we need to limit file size to MAX_FILE_SIZE for DB storage
project_status.number_of_failed += 1

if skipper_obj:
Expand Down
1 change: 1 addition & 0 deletions bedboss/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

# bedqc
MAX_FILE_SIZE = 1024 * 1024 * 1024 * 2
MAX_FILE_SIZE_QC = 1024 * 1024 * 25 # 25 MB
MAX_REGION_NUMBER = 5000000
MIN_REGION_WIDTH = 10

Expand Down
4 changes: 3 additions & 1 deletion bedboss/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ def __init__(self, reason: str = ""):
class QualityException(BedBossException):
"""Exception, when quality test of the bed file didn't pass."""

def __init__(self, reason: str = ""):
def __init__(self, reason: str = "", file_size: int = 0):
"""
Optionally provide explanation for exceptional condition.

:param str reason: reason why quality control wasn't successful
:param int file_size: file size in bytes (if available)
"""
self.reason = reason
self.file_size = file_size
super(QualityException, self).__init__(reason)


Expand Down
37 changes: 30 additions & 7 deletions bedboss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from bedboss.refgenome_validator.main import ReferenceValidator
from bedboss.exceptions import QualityException
from bedboss.const import MIN_REGION_WIDTH
from bedboss.const import MIN_REGION_WIDTH, MAX_FILE_SIZE_QC

_LOGGER = logging.getLogger("bedboss")

Expand Down Expand Up @@ -242,18 +242,40 @@ def wrapper(*args, **kwargs):
return wrapper


def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool:
def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> int:
"""
Run initial QC on the bed file

:param url: URL of the file
:param min_region_width: Minimum region width threshold to pass the quality check. Default is MIN_REGION_WIDTH (10)

:return: bool. Returns True if QC passed, False if unable to open in pandas
:raises: QualityException
:return: int. File size in bytes (0 if unable to determine)
:raises: QualityException (includes file_size attribute)
"""
_LOGGER.info(f"Running initial QC on the bed file: {url}")

file_size = 0

# Check file size before downloading content
try:
# Convert ftp:// to https:// for the HEAD request (e.g., NCBI FTP supports HTTPS)
check_url = (
url.replace("ftp://", "https://") if url.startswith("ftp://") else url
)
response = requests.head(check_url, allow_redirects=True)
content_length = response.headers.get("Content-Length")
if content_length:
file_size = int(content_length)
if file_size > MAX_FILE_SIZE_QC:
file_size_mb = file_size / (1024 * 1024)
max_size_mb = MAX_FILE_SIZE_QC / (1024 * 1024)
raise QualityException(
f"Initial QC failed for '{url}'. File size is '{file_size_mb:.2f} MB', where max file size is set to: '{max_size_mb:.0f} MB'",
file_size=file_size,
)
except requests.RequestException as err:
_LOGGER.warning(f"Unable to check file size: {err}. Continuing with QC...")

try:
with urllib.request.urlopen(url) as response:
with gzip.GzipFile(fileobj=response) as f:
Expand All @@ -267,12 +289,13 @@ def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool:
"Unable to read the file, initial QC failed, but continuing anyway..."
f"Error: {str(err)}"
)
return False
return file_size

if mean_width < min_region_width:
raise QualityException(
f"Initial QC failed for '{url}'. Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'"
f"Initial QC failed for '{url}'. Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'",
file_size=file_size,
)

_LOGGER.info(f"Initial QC passed for {url}")
return True
return file_size
2 changes: 1 addition & 1 deletion requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ peppy>=0.40.7
yacman>=0.8.4
requests>=2.28.2
piper>=v0.14.3
bbconf>=0.14.2
bbconf>=0.14.3
# bbconf @ git+https://github.com/databio/bbconf.git@comp_search#egg=bbconf
refgenconf>=0.12.2
pandas>=2.0.0
Expand Down
3 changes: 2 additions & 1 deletion scripts/all/qc_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669735/suppl/GSM8669735_ATAC-M1-1.narrowPeak.gz"
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669759/suppl/GSM8669759_H3K27ac-M2-1.narrowPeak.gz"
url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7163nnn/GSM7163568/suppl/GSM7163568_IRF3-ChIP-seq_HL_SVI_3h_exp1_2023_Total_peaks.bed.gz"
run_initial_qc(url)
file_size = run_initial_qc(url)
print(f"QC passed, File size: {file_size} bytes")
5 changes: 3 additions & 2 deletions scripts/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def run_gse():
# gse="gse157732", # series + samples test
# gse="gse209400", # series + samples test
# gse="gse206280",
gse="gse113157",
geo_tag="samples",
# gse="gse113157",
gse="gse174226", # too big series file - should fail
geo_tag="series",
bedbase_config="/home/bnt4me/virginia/repos/bedboss/config.yaml",
outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
# genome="HG38",
Expand Down
Loading