Skip to content

Commit 9943ce1

Browse files
authored
Merge pull request #644 from bwlang/adapter_dimer
adds specific output of number of adapter dimers observed
2 parents 88e901e + d4e1c64 commit 9943ce1

File tree

6 files changed

+50
-14
lines changed

6 files changed

+50
-14
lines changed

src/common.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,6 @@ typedef unsigned char uint8;
2424

2525
const char ATCG_BASES[] = {'A', 'T', 'C', 'G'};
2626

27-
#pragma pack(2)
28-
29-
30-
#pragma pack()
31-
32-
3327
// how many reads one pack has
3428
static const int PACK_SIZE = 256;
3529

@@ -50,6 +44,7 @@ static const int FAIL_LENGTH = 16;
5044
static const int FAIL_TOO_LONG = 17;
5145
static const int FAIL_QUALITY = 20;
5246
static const int FAIL_COMPLEXITY = 24;
47+
static const int FAIL_ADAPTER_DIMER = 28;
5348

5449
// how many types in total we support
5550
static const int FILTER_RESULT_TYPES = 32;
@@ -62,7 +57,7 @@ const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = {
6257
"failed_too_short", "failed_too_long", "", "",
6358
"failed_quality_filter", "", "", "",
6459
"failed_low_complexity", "", "", "",
65-
"", "", "", ""
60+
"failed_adapter_dimer", "", "", ""
6661
};
6762

6863
#endif /* COMMON_H */

src/filterresult.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ long FilterResult::getTotalPolyXTrimmedBases() {
202202
}
203203

204204

205+
206+
205207
void FilterResult::print() {
206208
cerr << "reads passed filter: " << mFilterReadStats[PASS_FILTER] << endl;
207209
cerr << "reads failed due to low quality: " << mFilterReadStats[FAIL_QUALITY] << endl;
@@ -215,6 +217,7 @@ void FilterResult::print() {
215217
cerr << "reads failed due to low complexity: " << mFilterReadStats[FAIL_COMPLEXITY] << endl;
216218
}
217219
if(mOptions->adapter.enabled) {
220+
cerr << "reads failed due to adapter dimer: " << mFilterReadStats[FAIL_ADAPTER_DIMER] << endl;
218221
cerr << "reads with adapter trimmed: " << mTrimmedAdapterRead << endl;
219222
cerr << "bases trimmed due to adapters: " << mTrimmedAdapterBases << endl;
220223
}
@@ -240,6 +243,8 @@ void FilterResult::reportJson(ofstream& ofs, string padding) {
240243
ofs << padding << "\t" << "\"too_many_N_reads\": " << mFilterReadStats[FAIL_N_BASE] << "," << endl;
241244
if(mOptions->complexityFilter.enabled)
242245
ofs << padding << "\t" << "\"low_complexity_reads\": " << mFilterReadStats[FAIL_COMPLEXITY] << "," << endl;
246+
if(mOptions->adapter.enabled)
247+
ofs << padding << "\t" << "\"adapter_dimer_reads\": " << mFilterReadStats[FAIL_ADAPTER_DIMER] << "," << endl;
243248
ofs << padding << "\t" << "\"too_short_reads\": " << mFilterReadStats[FAIL_LENGTH] << "," << endl;
244249
ofs << padding << "\t" << "\"too_long_reads\": " << mFilterReadStats[FAIL_TOO_LONG] << endl;
245250

@@ -366,6 +371,8 @@ void FilterResult::reportHtml(ofstream& ofs, long totalReads, long totalBases) {
366371
}
367372
if(mOptions->complexityFilter.enabled)
368373
HtmlReporter::outputRow(ofs, "reads with low complexity:", HtmlReporter::formatNumber(mFilterReadStats[FAIL_COMPLEXITY]) + " (" + to_string(mFilterReadStats[FAIL_COMPLEXITY] * 100.0 / total) + "%)");
374+
if(mOptions->adapter.enabled)
375+
HtmlReporter::outputRow(ofs, "reads with adapter dimer:", HtmlReporter::formatNumber(mFilterReadStats[FAIL_ADAPTER_DIMER]) + " (" + to_string(mFilterReadStats[FAIL_ADAPTER_DIMER] * 100.0 / total) + "%)");
369376
ofs << "</table>\n";
370377
}
371378

src/main.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ int main(int argc, char* argv[]){
5959
cmd.add<string>("adapter_fasta", 0, "specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file", false, "");
6060
cmd.add("detect_adapter_for_pe", '2', "enable adapter detection for PE data to get ultra-clean data. It takes more time to find just a little bit more adapters.");
6161
cmd.add("allow_gap_overlap_trimming", 0, "allow up to one gap when trim adapters by overlap analysis for PE data. By default no gap is allowed.");
62+
cmd.add<int>("dimer_max_len", 0, "if the read length is less than or equal to this value after adapter trimming, it is considered an adapter dimer. Requires adapter evidence.", false, 2);
6263

6364
// trimming
6465
cmd.add<int>("trim_front1", 'f', "trimming how many bases in front for read1, default is 0", false, 0);
@@ -217,6 +218,7 @@ int main(int argc, char* argv[]){
217218
opt.adapter.enabled = !cmd.exist("disable_adapter_trimming");
218219
opt.adapter.detectAdapterForPE = cmd.exist("detect_adapter_for_pe");
219220
opt.adapter.allowGapOverlapTrimming = cmd.exist("allow_gap_overlap_trimming");
221+
opt.adapter.dimerMaxLen = cmd.get<int>("dimer_max_len");
220222
opt.adapter.sequence = cmd.get<string>("adapter_sequence");
221223
opt.adapter.sequenceR2 = cmd.get<string>("adapter_sequence_r2");
222224
opt.adapter.fastaFile = cmd.get<string>("adapter_fasta");

src/options.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ class AdapterOptions {
202202
hasSeqR2 = false;
203203
detectAdapterForPE = false;
204204
allowGapOverlapTrimming = false;
205+
dimerMaxLen = 2;
205206
}
206207
public:
207208
bool enabled;
@@ -216,6 +217,7 @@ class AdapterOptions {
216217
bool hasFasta;
217218
bool detectAdapterForPE;
218219
bool allowGapOverlapTrimming;
220+
int dimerMaxLen;
219221
};
220222

221223
class TrimmingOptions {

src/peprocessor.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ bool PairEndProcessor::processPairEnd(ReadPack* leftPack, ReadPack* rightPack, T
430430
PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen);
431431
}
432432
bool isizeEvaluated = false;
433+
bool isAdapterDimer = false;
433434
if(r1 != NULL && r2!=NULL && (mOptions->adapter.enabled || mOptions->correction.enabled)){
434435
OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0, mOptions->adapter.allowGapOverlapTrimming);
435436
// we only use thread 0 to evaluae ISIZE
@@ -452,8 +453,16 @@ bool PairEndProcessor::processPairEnd(ReadPack* leftPack, ReadPack* rightPack, T
452453
trimmed2 = AdapterTrimmer::trimBySequence(r2, config->getFilterResult(), mOptions->adapter.sequenceR2, true);
453454
}
454455
if(mOptions->adapter.hasFasta) {
455-
AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, !trimmed1);
456-
AdapterTrimmer::trimByMultiSequences(r2, config->getFilterResult(), mOptions->adapter.seqsInFasta, true, !trimmed2);
456+
trimmed1 |= AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, !trimmed1);
457+
trimmed2 |= AdapterTrimmer::trimByMultiSequences(r2, config->getFilterResult(), mOptions->adapter.seqsInFasta, true, !trimmed2);
458+
}
459+
460+
// Check for adapter dimer: both reads shorter than threshold after adapter trimming
461+
// AND adapters were detected in at least one of the reads (requires evidence)
462+
if(r1 != NULL && r2 != NULL && (trimmed1 || trimmed2) &&
463+
r1->length() <= mOptions->adapter.dimerMaxLen &&
464+
r2->length() <= mOptions->adapter.dimerMaxLen) {
465+
isAdapterDimer = true;
457466
}
458467
}
459468
}
@@ -504,13 +513,19 @@ bool PairEndProcessor::processPairEnd(ReadPack* leftPack, ReadPack* rightPack, T
504513
mergeProcessed = true;
505514
} else if(mOptions->merge.includeUnmerged){
506515
int result1 = mFilter->passFilter(r1);
516+
int result2 = mFilter->passFilter(r2);
517+
518+
if(isAdapterDimer) {
519+
result1 = FAIL_ADAPTER_DIMER;
520+
result2 = FAIL_ADAPTER_DIMER;
521+
}
522+
507523
config->addFilterResult(result1, 1);
508524
if(result1 == PASS_FILTER && !dedupOut) {
509525
r1->appendToString(mergedOutput);
510526
config->getPostStats1()->statRead(r1);
511527
}
512528

513-
int result2 = mFilter->passFilter(r2);
514529
config->addFilterResult(result2, 1);
515530
if(result2 == PASS_FILTER && !dedupOut) {
516531
r2->appendToString(mergedOutput);
@@ -527,6 +542,11 @@ bool PairEndProcessor::processPairEnd(ReadPack* leftPack, ReadPack* rightPack, T
527542
int result1 = mFilter->passFilter(r1);
528543
int result2 = mFilter->passFilter(r2);
529544

545+
if(isAdapterDimer) {
546+
result1 = FAIL_ADAPTER_DIMER;
547+
result2 = FAIL_ADAPTER_DIMER;
548+
}
549+
530550
config->addFilterResult(max(result1, result2), 2);
531551

532552
if(!dedupOut) {

src/seprocessor.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
225225
if(mOptions->fixMGI) {
226226
or1->fixMGI();
227227
}
228-
228+
229229
// umi processing
230230
if(mOptions->umi.enabled)
231231
mUmiProcessor->process(or1);
@@ -239,13 +239,20 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
239239
PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen);
240240
}
241241

242+
bool isAdapterDimer = false;
242243
if(r1 != NULL && mOptions->adapter.enabled){
243244
bool trimmed = false;
244245
if(mOptions->adapter.hasSeqR1)
245246
trimmed = AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false);
246247
bool incTrimmedCounter = !trimmed;
247248
if(mOptions->adapter.hasFasta) {
248-
AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, incTrimmedCounter);
249+
trimmed |= AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, incTrimmedCounter);
250+
}
251+
252+
// Check for adapter dimer: read shorter than threshold after adapter trimming
253+
// AND adapter was detected (requires evidence)
254+
if(r1 != NULL && trimmed && r1->length() <= mOptions->adapter.dimerMaxLen) {
255+
isAdapterDimer = true;
249256
}
250257
}
251258

@@ -261,6 +268,9 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
261268

262269
int result = mFilter->passFilter(r1);
263270

271+
if(isAdapterDimer)
272+
result = FAIL_ADAPTER_DIMER;
273+
264274
config->addFilterResult(result, 1);
265275

266276
if(!dedupOut) {
@@ -285,7 +295,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
285295
// split output by each worker thread
286296
if(!mOptions->out1.empty())
287297
config->getWriter1()->writeString(outstr);
288-
}
298+
}
289299

290300
if(mLeftWriter) {
291301
mLeftWriter->input(tid, outstr);
@@ -438,7 +448,7 @@ void SingleEndProcessor::processorTask(ThreadConfig* config)
438448
usleep(100);
439449
}
440450
}
441-
input->setConsumerFinished();
451+
input->setConsumerFinished();
442452

443453
mFinishedThreads++;
444454
if(mFinishedThreads == mOptions->thread) {

0 commit comments

Comments
 (0)