diff --git a/bedboss/_version.py b/bedboss/_version.py index d69d16e..a2fecb4 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.9.1" +__version__ = "0.9.2" diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 7af38ce..08cc1b0 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -34,6 +34,7 @@ ) from bedboss.bedboss import run_all from bedboss.bedbuncher.bedbuncher import run_bedbuncher +from bedboss.const import MAX_FILE_SIZE from bedboss.exceptions import BedBossException, QualityException from bedboss.skipper import Skipper from bedboss.refgenome_validator.main import ReferenceValidator @@ -693,11 +694,19 @@ def _upload_gse( ) # to speed up the process, we can run initial QC on the file - run_initial_qc(project_sample.file_url) + qc_file_size = run_initial_qc(project_sample.file_url) + if qc_file_size > 0: + sample_status.file_size = min( + qc_file_size, MAX_FILE_SIZE + ) # we need to limit file size to MAX_FILE_SIZE for DB storage except QualityException as err: _LOGGER.error(f"Processing of '{sample_gsm}' failed with error: {str(err)}") sample_status.status = STATUS.FAIL sample_status.error = str(err) + if err.file_size > 0: + sample_status.file_size = min( + err.file_size, MAX_FILE_SIZE + ) # we need to limit file size to MAX_FILE_SIZE for DB storage project_status.number_of_failed += 1 if skipper_obj: diff --git a/bedboss/const.py b/bedboss/const.py index 12ce79b..115cf03 100644 --- a/bedboss/const.py +++ b/bedboss/const.py @@ -23,6 +23,7 @@ # bedqc MAX_FILE_SIZE = 1024 * 1024 * 1024 * 2 +MAX_FILE_SIZE_QC = 1024 * 1024 * 25 # 25 MB MAX_REGION_NUMBER = 5000000 MIN_REGION_WIDTH = 10 diff --git a/bedboss/exceptions.py b/bedboss/exceptions.py index 84a52bd..3d1502d 100644 --- a/bedboss/exceptions.py +++ b/bedboss/exceptions.py @@ -27,13 +27,15 @@ def __init__(self, reason: str = ""): class QualityException(BedBossException): """Exception, when quality test of the bed file didn't 
pass.""" - def __init__(self, reason: str = ""): + def __init__(self, reason: str = "", file_size: int = 0): """ Optionally provide explanation for exceptional condition. :param str reason: reason why quality control wasn't successful + :param int file_size: file size in bytes (if available) """ self.reason = reason + self.file_size = file_size super(QualityException, self).__init__(reason) diff --git a/bedboss/utils.py b/bedboss/utils.py index b8d4c57..7888e9b 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -19,7 +19,7 @@ from bedboss.refgenome_validator.main import ReferenceValidator from bedboss.exceptions import QualityException -from bedboss.const import MIN_REGION_WIDTH +from bedboss.const import MIN_REGION_WIDTH, MAX_FILE_SIZE_QC _LOGGER = logging.getLogger("bedboss") @@ -242,18 +242,40 @@ def wrapper(*args, **kwargs): return wrapper -def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool: +def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> int: """ Run initial QC on the bed file :param url: URL of the file :param min_region_width: Minimum region width threshold to pass the quality check. Default is 20 - :return: bool. Returns True if QC passed, False if unable to open in pandas - :raises: QualityException + :return: int. 
File size in bytes (0 if unable to determine) + :raises: QualityException (includes file_size attribute) """ _LOGGER.info(f"Running initial QC on the bed file: {url}") + file_size = 0 + + # Check file size before downloading content + try: + # Convert ftp:// to https:// for the HEAD request (e.g., NCBI FTP supports HTTPS) + check_url = ( + url.replace("ftp://", "https://") if url.startswith("ftp://") else url + ) + response = requests.head(check_url, allow_redirects=True) + content_length = response.headers.get("Content-Length") + if content_length: + file_size = int(content_length) + if file_size > MAX_FILE_SIZE_QC: + file_size_mb = file_size / (1024 * 1024) + max_size_mb = MAX_FILE_SIZE_QC / (1024 * 1024) + raise QualityException( + f"Initial QC failed for '{url}'. File size is '{file_size_mb:.2f} MB', where max file size is set to: '{max_size_mb:.0f} MB'", + file_size=file_size, + ) + except requests.RequestException as err: + _LOGGER.warning(f"Unable to check file size: {err}. Continuing with QC...") + try: with urllib.request.urlopen(url) as response: with gzip.GzipFile(fileobj=response) as f: @@ -267,12 +289,13 @@ def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool: "Unable to read the file, initial QC failed, but continuing anyway..." f"Error: {str(err)}" ) - return False + return file_size if mean_width < min_region_width: raise QualityException( - f"Initial QC failed for '{url}'. Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'" + f"Initial QC failed for '{url}'. 
Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'", + file_size=file_size, ) _LOGGER.info(f"Initial QC passed for {url}") - return True + return file_size diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 1c930b9..31f7a3a 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -5,7 +5,7 @@ peppy>=0.40.7 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.3 -bbconf>=0.14.2 +bbconf>=0.14.3 # bbconf @ git+https://github.com/databio/bbconf.git@comp_search#egg=bbconf refgenconf>=0.12.2 pandas>=2.0.0 diff --git a/scripts/all/qc_initial.py b/scripts/all/qc_initial.py index 2cf8f93..6ce6089 100644 --- a/scripts/all/qc_initial.py +++ b/scripts/all/qc_initial.py @@ -6,4 +6,5 @@ # url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669735/suppl/GSM8669735_ATAC-M1-1.narrowPeak.gz" # url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669759/suppl/GSM8669759_H3K27ac-M2-1.narrowPeak.gz" url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7163nnn/GSM7163568/suppl/GSM7163568_IRF3-ChIP-seq_HL_SVI_3h_exp1_2023_Total_peaks.bed.gz" - run_initial_qc(url) + file_size = run_initial_qc(url) + print(f"QC passed, File size: {file_size} bytes") diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index 46728d3..81e949b 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -57,8 +57,9 @@ def run_gse(): # gse="gse157732", # series + samples test # gse="gse209400", # series + samples test # gse="gse206280", - gse="gse113157", - geo_tag="samples", + # gse="gse113157", + gse="gse174226", # too big series file - should fail + geo_tag="series", bedbase_config="/home/bnt4me/virginia/repos/bedboss/config.yaml", outfolder="/home/bnt4me/virginia/repos/bbuploader/data", # genome="HG38",