Skip to content

Commit 74206cd

Browse files
authored
fix(ENGKNOW-3046): enable cram binding to correct reference genome for exome cloud bfx (#103)
* feat(ENGKNOW-3046): Enable multiple cram reference files. * feat(ENGKNOW-3046): Enable multiple cram reference files. * feat(ENGKNOW-3046): Enable multiple cram reference files. * feat(ENGKNOW-3046): Enable multiple cram reference files. * feat(ENGKNOW-3046): Enable multiple cram reference files. * feat(ENGKNOW-3046): Bump version * feat(ENGKNOW-3046): Enable multiple cram reference files.
1 parent d44ae4b commit 74206cd

File tree

6 files changed

+142
-101
lines changed

6 files changed

+142
-101
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
5.8.3
1+
5.8.4

gortools/src/test/java/gorsat/UTestCram.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@
3838
import java.io.IOException;
3939
import java.nio.charset.Charset;
4040
import java.nio.file.Paths;
41-
import java.util.List;
4241

4342
import static gorsat.TestUtils.LINE_SPLIT_PATTERN;
44-
import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER;
43+
import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_PREFER_FOLDER;
4544

4645
public class UTestCram {
4746

@@ -121,15 +120,15 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException {
121120
try {
122121
TestUtils.runGorPipeCount(args);
123122
} catch (GorResourceException e) {
124-
Assert.assertTrue(e.getMessage().startsWith("Reference does not exist."));
123+
Assert.assertTrue(e.getMessage().startsWith("No cram reference found"));
125124
Assert.assertTrue(e.getUri().endsWith("cram_query_sorted2.fasta"));
126125
}
127126
}
128127

129128
@Test
130129
public void readCramWithFastaReferenceAndGenerateMissingAttributes() {
131130
System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA));
132-
System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false");
131+
System.setProperty(KEY_REFERENCE_PREFER_FOLDER, "false");
133132

134133
String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)};
135134

model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java

Lines changed: 73 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,11 @@ public class CramIterator extends BamIterator {
7070

7171
public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes";
7272
public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource";
73-
public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder.";
73+
public final static String KEY_REFERENCE_PREFER_FOLDER = "gor.driver.cram.reference.preferfolder";
7474

7575
private static final Logger log = LoggerFactory.getLogger(CramIterator.class);
7676

77-
private CramFile cramFile;
77+
private final CramFile cramFile;
7878
private int[] columns;
7979
ChromoLookup lookup;
8080
private String fileName;
@@ -164,7 +164,7 @@ public void init(GorSession session) {
164164

165165
fileName = cramFile.getFileSource().getSourceReference().getUrl();
166166

167-
referenceSource = createReferenceSource(getInitialReferenceFile(), session.getProjectContext().getRealProjectRoot());
167+
referenceSource = createReferenceSource(session.getProjectContext().getRealProjectRoot());
168168

169169
SeekableBufferedStream cramStream = new SeekableBufferedStream(new StreamSourceSeekableStream(cramFile.getFileSource()));
170170

@@ -191,14 +191,9 @@ public void init(GorSession session) {
191191
*
192192
* @return initial reference file
193193
*/
194-
private String getInitialReferenceFile() {
194+
private String getSourceReferenceFile() {
195195
StreamSource ref = cramFile.getReferenceSource();
196-
String referenceFileName = "";
197-
198-
if (ref != null) {
199-
referenceFileName = ref.getSourceReference().getUrl();
200-
}
201-
return referenceFileName;
196+
return ref != null ? ref.getSourceReference().getUrl() : null;
202197
}
203198

204199
private void closeReferenceFile() {
@@ -211,74 +206,99 @@ private void closeReferenceFile() {
211206
}
212207
}
213208

214-
private CRAMReferenceSource createReferenceSource(String ref, String root) {
209+
private CRAMReferenceSource createReferenceSource(String root) {
210+
File file = null;
211+
boolean forceFolder = false;
212+
213+
String sourceRef = getSourceReferenceFile();
214+
if (!Strings.isNullOrEmpty(sourceRef)) {
215+
file = new File(sourceRef);
216+
}
217+
218+
if (file == null) {
219+
file = getReferenceFromReferenceLinkFile();
220+
}
221+
222+
if (file == null) {
223+
file = getReferenceFromGorConfig(root);
224+
forceFolder = Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_PREFER_FOLDER, "true"));
225+
}
215226

216-
File file = new File(ref);
217-
file = getReferenceFromReferenceLinkFile(file);
218-
file = getReferenceFromGorConfig(file, root);
219-
file = getReferenceFromGorOptions(file);
227+
if (file == null) {
228+
file = getReferenceFromGorOptions();
229+
forceFolder = Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_PREFER_FOLDER, "true"));
220230

221-
if (!file.exists()) {
222-
throw new GorResourceException("Reference does not exist.", file.toString());
231+
}
232+
if (file == null || !file.exists()) {
233+
throw new GorResourceException("No cram reference found: %s".formatted(file), file != null ? file.getPath() : "null");
223234
}
224235

225236
// This reference should be fasta but we let the htsjdk library decide
226-
return createFileReference(file);
237+
return createFileReference(file, forceFolder);
227238
}
228239

229-
private File getReferenceFromGorOptions(File file) {
230-
if (!file.exists()) {
231-
String refPath = System.getProperty(KEY_FASTAREFERENCESOURCE, "");
240+
private File getReferenceFromGorOptions() {
241+
String refPath = System.getProperty(KEY_FASTAREFERENCESOURCE, "");
232242

233-
if (!StringUtils.isEmpty(refPath)) {
234-
return new File(refPath);
235-
}
243+
if (!StringUtils.isEmpty(refPath)) {
244+
return new File(refPath);
236245
}
237-
return file;
246+
247+
return null;
238248
}
239249

240-
private File getReferenceFromGorConfig(File file, String root) {
241-
if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) {
250+
private File getReferenceFromGorConfig(String root) {
251+
if (!Strings.isNullOrEmpty(projectCramReferencePath)) {
242252
return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile();
243253
}
244-
return file;
254+
return null;
245255
}
246256

247-
private File getReferenceFromReferenceLinkFile(File file) {
248-
if (!file.exists()) {
249-
File refLinkFile = new File(this.fileName + ".ref");
257+
private File getReferenceFromReferenceLinkFile() {
258+
File refLinkFile = new File(this.fileName + ".ref");
250259

251-
if (refLinkFile.exists()) {
252-
try {
253-
List<String> lines = FileUtils.readLines(refLinkFile, Charset.defaultCharset());
260+
if (refLinkFile.exists()) {
261+
try {
262+
List<String> lines = FileUtils.readLines(refLinkFile, Charset.defaultCharset());
254263

255-
if (lines.size() > 0) {
256-
return new File(lines.get(0));
257-
}
258-
} catch (IOException e) {
259-
/*Do Nothing*/
264+
if (lines.size() > 0) {
265+
return new File(lines.get(0));
260266
}
267+
} catch (IOException e) {
268+
/*Do Nothing*/
261269
}
262270
}
263-
return file;
271+
272+
return null;
264273
}
265274

266-
private CRAMReferenceSource createFileReference(File refFile) {
275+
private CRAMReferenceSource createFileReference(File refFile, boolean preferFolder) {
267276
if (refFile.isDirectory()) {
268-
return new CompositeReferenceSource(List.of(
269-
new FolderReferenceSource(refFile.getPath()),
270-
new EBIReferenceSource(refFile.getPath())));
271-
} else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) {
272-
return new CompositeReferenceSource(List.of(
273-
new FolderReferenceSource(refFile.getParent()),
274-
new EBIReferenceSource(refFile.getParent())));
277+
return createCompositeReferenceSource(refFile);
278+
} else if (preferFolder) {
279+
try {
280+
return createCompositeReferenceSource(refFile.getParentFile());
281+
} catch (Exception e) {
282+
// Fallback to single file, in case none of the files contains proper meta.
283+
return createSharedFastaReferenceSource(refFile);
284+
}
275285
} else {
276-
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
277-
278-
String referenceKey = FilenameUtils.removeExtension(refFile.getName());
279-
var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
280-
return new SharedFastaReferenceSource(referenceFile, referenceKey);
286+
return createSharedFastaReferenceSource(refFile);
281287
}
282288
}
283289

290+
private CRAMReferenceSource createSharedFastaReferenceSource(File refFile) {
291+
log.debug("Using fasta reference file for CRAM: {}", refFile.getPath());
292+
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
293+
294+
String referenceKey = FilenameUtils.removeExtension(refFile.getName());
295+
var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
296+
return new SharedFastaReferenceSource(referenceFile, referenceKey);
297+
}
298+
private CRAMReferenceSource createCompositeReferenceSource(File refFolder) {
299+
log.debug("Using folder reference for CRAM: {}", refFolder.getPath());
300+
return new CompositeReferenceSource(List.of(
301+
new FolderReferenceSource(refFolder.getPath()),
302+
new EBIReferenceSource(refFolder.getPath())));
303+
}
284304
}

model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java

Lines changed: 44 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,22 @@
22

33
import htsjdk.samtools.Defaults;
44
import htsjdk.samtools.SAMSequenceRecord;
5+
import htsjdk.samtools.cram.io.InputStreamUtils;
6+
import htsjdk.samtools.util.SequenceUtil;
57
import org.gorpipe.exceptions.GorDataException;
68
import org.gorpipe.exceptions.GorResourceException;
79
import org.gorpipe.gor.table.util.PathUtils;
810
import org.gorpipe.util.Strings;
911
import org.slf4j.Logger;
1012
import org.slf4j.LoggerFactory;
1113

12-
import java.io.BufferedInputStream;
1314
import java.io.IOException;
14-
import java.net.HttpURLConnection;
15+
import java.io.InputStream;
1516
import java.net.URL;
1617
import java.nio.file.Files;
1718
import java.nio.file.Path;
1819
import java.util.HashSet;
20+
import java.util.Locale;
1921
import java.util.Map;
2022
import java.util.Set;
2123
import java.util.concurrent.ConcurrentHashMap;
@@ -34,6 +36,8 @@ public class EBIReferenceSource extends MD5CachedReferenceSource {
3436

3537
protected static Map<String, Path> md5ToRefbases = new ConcurrentHashMap<>();
3638

39+
private static final int DOWNLOAD_TRIES_BEFORE_FAILING = 2;
40+
3741
private Path referenceFolder; // If null we do not download.
3842

3943
public EBIReferenceSource() {
@@ -81,8 +85,10 @@ private void processRefbasesFile(Path refbases) {
8185

8286
@Override
8387
protected byte[] loadReference(final SAMSequenceRecord record) {
88+
var md5 = record.getMd5();
89+
8490
// Load from refbases file.
85-
Path refbasesPath = md5ToRefbases.get(record.getMd5());
91+
Path refbasesPath = md5ToRefbases.get(md5);
8692
if (refbasesPath != null) {
8793
try {
8894
byte[] bases = Files.readAllBytes(refbasesPath);
@@ -96,49 +102,55 @@ protected byte[] loadReference(final SAMSequenceRecord record) {
96102
}
97103

98104
// Load from EBI service.
99-
if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD,
100-
Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)))) {
105+
if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, "True"))) {
101106
try {
102-
103-
byte[] bases = downloadFromEBI(record.getMd5());
104-
if (bases != EMPTY_BASES) {
105-
saveRefbasesToDisk(record.getMd5(), bases);
107+
// Just use mem, this is going into mem cache anyway.
108+
byte[] bases = downloadFromEBI(md5);
109+
if (bases != null) {
110+
saveRefbasesToDisk(md5, bases);
106111
}
107112
return bases;
108-
} catch (IOException e) {
109-
log.warn("Could not download/save reference sequence for md5 " + record.getMd5(), e);
113+
} catch (Exception e) {
114+
log.warn("Could not download/save reference sequence for md5 " + md5, e);
110115
}
111116
}
112117

113-
return EMPTY_BASES;
118+
return null;
114119
}
115120

116121
/**
117122
* Download reference sequence from EBI by MD5 and store it in the reference folder.
118123
* @param md5
119124
* @return bytes of the reference sequence, null if not found.
120-
* @throws IOException
125+
* @throws IOException if the sequence is not found or the download fails.
121126
*/
122-
private byte[] downloadFromEBI(String md5) throws IOException {
123-
log.info("Downloading reference {} from ENA", md5);
124-
URL url = new URL(String.format(Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5));
125-
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
126-
conn.setConnectTimeout(15000);
127-
conn.setReadTimeout(30000);
128-
conn.setRequestMethod("GET");
129-
130-
if (conn.getResponseCode() != 200) {
131-
log.warn("ENA returned {} for {}", conn.getResponseCode(), md5);
132-
return EMPTY_BASES;
133-
}
134-
135-
byte[] bases;
136-
try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) {
137-
bases = in.readAllBytes();
127+
private byte[] downloadFromEBI(final String md5) throws IOException {
128+
final String url = String.format(Locale.US, Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5);
129+
130+
for (int i = 0; i < DOWNLOAD_TRIES_BEFORE_FAILING; i++) {
131+
try (final InputStream is = new URL(url).openStream()) {
132+
if (is == null)
133+
return null;
134+
135+
log.info("Downloading reference sequence: {}", url);
136+
final byte[] bases = InputStreamUtils.readFully(is);
137+
log.info("Downloaded {} bytes for md5 {}", bases.length, md5);
138+
139+
final String downloadedMD5 = SequenceUtil.calculateMD5String(bases);
140+
if (md5.equals(downloadedMD5)) {
141+
return bases;
142+
} else {
143+
log.error("Downloaded sequence is corrupt: requested md5={}, received md5={}",
144+
md5, downloadedMD5);
145+
}
146+
return bases;
147+
}
148+
catch (final IOException e) {
149+
log.warn("Failed to download reference sequence for md5 {} on try {}/{}",
150+
md5, (i + 1), DOWNLOAD_TRIES_BEFORE_FAILING, e);
151+
}
138152
}
139-
if (bases.length == 0) return EMPTY_BASES;
140-
141-
return bases;
153+
throw new IOException("Giving up on downloading sequence for md5 %s".formatted(md5));
142154
}
143155

144156
private void saveRefbasesToDisk(String md5, byte[] bases) throws IOException {

model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ protected byte[] loadReference(final SAMSequenceRecord record) {
7777
return rsFile.getSequence(referencePath.contig()).getBases();
7878
}
7979

80-
return EMPTY_BASES;
80+
return null;
8181
}
8282

8383
private void scanReferenceFolder() {

0 commit comments

Comments
 (0)