1919
2020from bedboss .refgenome_validator .main import ReferenceValidator
2121from bedboss .exceptions import QualityException
22- from bedboss .const import MIN_REGION_WIDTH
22+ from bedboss .const import MIN_REGION_WIDTH , MAX_FILE_SIZE_QC
2323
2424_LOGGER = logging .getLogger ("bedboss" )
2525
@@ -242,18 +242,40 @@ def wrapper(*args, **kwargs):
242242 return wrapper
243243
244244
245- def run_initial_qc (url : str , min_region_width : int = MIN_REGION_WIDTH ) -> bool :
245+ def run_initial_qc (url : str , min_region_width : int = MIN_REGION_WIDTH ) -> int :
246246 """
247247 Run initial QC on the bed file
248248
249249 :param url: URL of the file
250250 :param min_region_width: Minimum region width threshold to pass the quality check. Default is 20
251251
252- :return: bool. Returns True if QC passed, False if unable to open in pandas
253- :raises: QualityException
252+ :return: int. File size in bytes (0 if unable to determine)
253+ :raises: QualityException (includes file_size attribute)
254254 """
255255 _LOGGER .info (f"Running initial QC on the bed file: { url } " )
256256
257+ file_size = 0
258+
259+ # Check file size before downloading content
260+ try :
261+ # Convert ftp:// to https:// for the HEAD request (e.g., NCBI FTP supports HTTPS)
262+ check_url = (
263+ url .replace ("ftp://" , "https://" ) if url .startswith ("ftp://" ) else url
264+ )
265+ response = requests .head (check_url , allow_redirects = True )
266+ content_length = response .headers .get ("Content-Length" )
267+ if content_length :
268+ file_size = int (content_length )
269+ if file_size > MAX_FILE_SIZE_QC :
270+ file_size_mb = file_size / (1024 * 1024 )
271+ max_size_mb = MAX_FILE_SIZE_QC / (1024 * 1024 )
272+ raise QualityException (
273+ f"Initial QC failed for '{ url } '. File size is '{ file_size_mb :.2f} MB', where max file size is set to: '{ max_size_mb :.0f} MB'" ,
274+ file_size = file_size ,
275+ )
276+ except requests .RequestException as err :
277+ _LOGGER .warning (f"Unable to check file size: { err } . Continuing with QC..." )
278+
257279 try :
258280 with urllib .request .urlopen (url ) as response :
259281 with gzip .GzipFile (fileobj = response ) as f :
@@ -267,12 +289,13 @@ def run_initial_qc(url: str, min_region_width: int = MIN_REGION_WIDTH) -> bool:
267289 "Unable to read the file, initial QC failed, but continuing anyway..."
268290 f"Error: { str (err )} "
269291 )
270- return False
292+ return file_size
271293
272294 if mean_width < min_region_width :
273295 raise QualityException (
274- f"Initial QC failed for '{ url } '. Mean region width is '{ mean_width } ', where min region width is set to: '{ min_region_width } '"
296+ f"Initial QC failed for '{ url } '. Mean region width is '{ mean_width } ', where min region width is set to: '{ min_region_width } '" ,
297+ file_size = file_size ,
275298 )
276299
277300 _LOGGER .info (f"Initial QC passed for { url } " )
278- return True
301+ return file_size
0 commit comments