Skip to content

Commit 289fcb9

Browse files
Merge pull request #147 from databio/dev
Release 0.9.2
2 parents fe2b233 + 9af957e commit 289fcb9

File tree

8 files changed

+51
-14
lines changed

8 files changed

+51
-14
lines changed

bedboss/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.9.1"
1+
__version__ = "0.9.2"

bedboss/bbuploader/main.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
)
3535
from bedboss.bedboss import run_all
3636
from bedboss.bedbuncher.bedbuncher import run_bedbuncher
37+
from bedboss.const import MAX_FILE_SIZE
3738
from bedboss.exceptions import BedBossException, QualityException
3839
from bedboss.skipper import Skipper
3940
from bedboss.refgenome_validator.main import ReferenceValidator
@@ -693,11 +694,19 @@ def _upload_gse(
693694
)
694695

695696
# to speed up the process, we can run initial QC on the file
696-
run_initial_qc(project_sample.file_url)
697+
qc_file_size = run_initial_qc(project_sample.file_url)
698+
if qc_file_size > 0:
699+
sample_status.file_size = min(
700+
qc_file_size, MAX_FILE_SIZE
701+
) # we need to limit file size to MAX_FILE_SIZE for DB storage
697702
except QualityException as err:
698703
_LOGGER.error(f"Processing of '{sample_gsm}' failed with error: {str(err)}")
699704
sample_status.status = STATUS.FAIL
700705
sample_status.error = str(err)
706+
if err.file_size > 0:
707+
sample_status.file_size = min(
708+
err.file_size, MAX_FILE_SIZE
709+
) # we need to limit file size to MAX_FILE_SIZE for DB storage
701710
project_status.number_of_failed += 1
702711

703712
if skipper_obj:

bedboss/const.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
# bedqc
2525
MAX_FILE_SIZE = 1024 * 1024 * 1024 * 2
26+
MAX_FILE_SIZE_QC = 1024 * 1024 * 25 # 25 MB
2627
MAX_REGION_NUMBER = 5000000
2728
MIN_REGION_WIDTH = 10
2829

bedboss/exceptions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,15 @@ def __init__(self, reason: str = ""):
2727
class QualityException(BedBossException):
2828
"""Exception, when quality test of the bed file didn't pass."""
2929

30-
def __init__(self, reason: str = ""):
30+
def __init__(self, reason: str = "", file_size: int = 0):
3131
"""
3232
Optionally provide explanation for exceptional condition.
3333
3434
:param str reason: reason why quality control wasn't successful
35+
:param int file_size: file size in bytes (if available)
3536
"""
3637
self.reason = reason
38+
self.file_size = file_size
3739
super(QualityException, self).__init__(reason)
3840

3941

bedboss/utils.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from bedboss.refgenome_validator.main import ReferenceValidator
2121
from bedboss.exceptions import QualityException
22-
from bedboss.const import MIN_REGION_WIDTH
22+
from bedboss.const import MIN_REGION_WIDTH, MAX_FILE_SIZE_QC
2323

2424
_LOGGER = logging.getLogger("bedboss")
2525

@@ -242,18 +242,40 @@ def wrapper(*args, **kwargs):
242242
return wrapper
243243

244244

245-
def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool:
245+
def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> int:
246246
"""
247247
Run initial QC on the bed file
248248
249249
:param url: URL of the file
250250
:param min_region_width: Minimum region width threshold to pass the quality check. Defaults to MIN_REGION_WIDTH
251251
252-
:return: bool. Returns True if QC passed, False if unable to open in pandas
253-
:raises: QualityException
252+
:return: int. File size in bytes (0 if unable to determine)
253+
:raises: QualityException (includes file_size attribute)
254254
"""
255255
_LOGGER.info(f"Running initial QC on the bed file: {url}")
256256

257+
file_size = 0
258+
259+
# Check file size before downloading content
260+
try:
261+
# Convert ftp:// to https:// for the HEAD request (e.g., NCBI FTP supports HTTPS)
262+
check_url = (
263+
url.replace("ftp://", "https://") if url.startswith("ftp://") else url
264+
)
265+
response = requests.head(check_url, allow_redirects=True)
266+
content_length = response.headers.get("Content-Length")
267+
if content_length:
268+
file_size = int(content_length)
269+
if file_size > MAX_FILE_SIZE_QC:
270+
file_size_mb = file_size / (1024 * 1024)
271+
max_size_mb = MAX_FILE_SIZE_QC / (1024 * 1024)
272+
raise QualityException(
273+
f"Initial QC failed for '{url}'. File size is '{file_size_mb:.2f} MB', where max file size is set to: '{max_size_mb:.0f} MB'",
274+
file_size=file_size,
275+
)
276+
except requests.RequestException as err:
277+
_LOGGER.warning(f"Unable to check file size: {err}. Continuing with QC...")
278+
257279
try:
258280
with urllib.request.urlopen(url) as response:
259281
with gzip.GzipFile(fileobj=response) as f:
@@ -267,12 +289,13 @@ def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool:
267289
"Unable to read the file, initial QC failed, but continuing anyway..."
268290
f"Error: {str(err)}"
269291
)
270-
return False
292+
return file_size
271293

272294
if mean_width < min_region_width:
273295
raise QualityException(
274-
f"Initial QC failed for '{url}'. Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'"
296+
f"Initial QC failed for '{url}'. Mean region width is '{mean_width}', where min region width is set to: '{min_region_width}'",
297+
file_size=file_size,
275298
)
276299

277300
_LOGGER.info(f"Initial QC passed for {url}")
278-
return True
301+
return file_size

requirements/requirements-all.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ peppy>=0.40.7
55
yacman>=0.8.4
66
requests>=2.28.2
77
piper>=v0.14.3
8-
bbconf>=0.14.2
8+
bbconf>=0.14.3
99
# bbconf @ git+https://github.com/databio/bbconf.git@comp_search#egg=bbconf
1010
refgenconf>=0.12.2
1111
pandas>=2.0.0

scripts/all/qc_initial.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@
66
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669735/suppl/GSM8669735_ATAC-M1-1.narrowPeak.gz"
77
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8669nnn/GSM8669759/suppl/GSM8669759_H3K27ac-M2-1.narrowPeak.gz"
88
url = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7163nnn/GSM7163568/suppl/GSM7163568_IRF3-ChIP-seq_HL_SVI_3h_exp1_2023_Total_peaks.bed.gz"
9-
run_initial_qc(url)
9+
file_size = run_initial_qc(url)
10+
print(f"QC passed, File size: {file_size} bytes")

scripts/bbuploader/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ def run_gse():
5757
# gse="gse157732", # series + samples test
5858
# gse="gse209400", # series + samples test
5959
# gse="gse206280",
60-
gse="gse113157",
61-
geo_tag="samples",
60+
# gse="gse113157",
61+
gse="gse174226", # series file is too big - should fail
62+
geo_tag="series",
6263
bedbase_config="/home/bnt4me/virginia/repos/bedboss/config.yaml",
6364
outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
6465
# genome="HG38",

0 commit comments

Comments
 (0)