Skip to content

Commit 581ae14

Browse files
fix: use correct tokenization and token type IDs for BERT-style cross encoder
1 parent 196fa06 commit 581ae14

File tree

5 files changed

+107
-34
lines changed

5 files changed

+107
-34
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7+
## [0.5.4] - 2025-09-10
8+
9+
### Changed
10+
11+
- Fix: use correct tokenization and token type IDs for BERT-style sentence pairs in the cross encoder
12+
713
## [0.5.3] - 2025-09-01
814

915
### Changed

hugot_test.go

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -820,29 +820,50 @@ func crossEncoderPipeline(t *testing.T, session *Session) {
820820
pipeline, err := NewPipeline(session, config)
821821
checkT(t, err)
822822

823-
query := "What is the capital of France?"
823+
query := "Organic skincare products for sensitive skin"
824824
documents := []string{
825-
"Paris is the capital of France.",
826-
"The Eiffel Tower is in Paris.",
827-
"France is a country in Europe.",
825+
"Eco-friendly kitchenware for modern homes",
826+
"Biodegradable cleaning supplies for eco-conscious consumers",
827+
"Organic cotton baby clothes for sensitive skin",
828+
"Natural organic skincare range for sensitive skin",
829+
"Tech gadgets for smart homes: 2024 edition",
830+
"Sustainable gardening tools and compost solutions",
831+
"Sensitive skin-friendly facial cleansers and toners",
832+
"Organic food wraps and storage solutions",
833+
"All-natural pet food for dogs with allergies",
834+
"Yoga mats made from recycled materials",
835+
}
836+
837+
type Expected struct {
838+
Document string
839+
Score float32
840+
}
841+
842+
expectedRoberta := []Expected{
843+
{Document: "Natural organic skincare range for sensitive skin", Score: 0.95478064},
844+
{Document: "Organic cotton baby clothes for sensitive skin", Score: 0.8185698},
845+
{Document: "Sensitive skin-friendly facial cleansers and toners", Score: 0.5848757},
846+
{Document: "Organic food wraps and storage solutions", Score: 0.2567817},
847+
{Document: "Biodegradable cleaning supplies for eco-conscious consumers", Score: 0.22029042},
848+
{Document: "Yoga mats made from recycled materials", Score: 0.20082192},
849+
{Document: "Sustainable gardening tools and compost solutions", Score: 0.19299757},
850+
{Document: "All-natural pet food for dogs with allergies", Score: 0.18836288},
851+
{Document: "Eco-friendly kitchenware for modern homes", Score: 0.18346606},
852+
{Document: "Tech gadgets for smart homes: 2024 edition", Score: 0.16224432},
828853
}
829854

830855
inputs := append([]string{query}, documents...)
831856
output, err := pipeline.Run(inputs)
832857
checkT(t, err)
833-
834858
results := output.(*pipelines.CrossEncoderOutput).Results
835-
if len(results) != 3 {
836-
t.Errorf("Expected 3 results, got %d", len(results))
837-
}
838-
if results[0].Document != "Paris is the capital of France." {
839-
t.Errorf("Expected 'Paris is the capital of France.' as best document, got '%s'", results[0].Document)
840-
}
841-
if results[0].Score <= results[1].Score {
842-
t.Errorf("Expected result 0 to have higher score than result 1, but got %f and %f", results[0].Score, results[1].Score)
843-
}
844-
if results[1].Score <= results[2].Score {
845-
t.Errorf("Expected result 1 to have higher score than result 2, but got %f and %f", results[1].Score, results[2].Score)
859+
860+
for i, expected := range expectedRoberta {
861+
if expected.Document != results[i].Document {
862+
t.Errorf("Expected document '%s', got '%s'", expected.Document, results[i].Document)
863+
}
864+
if math.Abs(float64(expected.Score-results[i].Score)) > 0.01 {
865+
t.Errorf("Expected score '%f', got '%f'", expected.Score, results[i].Score)
866+
}
846867
}
847868
}
848869

pipelines/crossEncoder.go

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ type CrossEncoderPipeline struct {
2222
}
2323

2424
type CrossEncoderStats struct {
25-
TotalQueries uint64
26-
TotalDocuments uint64
27-
AverageLatency time.Duration
28-
AverageBatchSize float64
29-
TruncatedSequences uint64
30-
FilteredResults uint64
25+
TotalQueries uint64
26+
TotalDocuments uint64
27+
AverageLatency time.Duration
28+
AverageBatchSize float64
29+
FilteredResults uint64
3130
}
3231

3332
type CrossEncoderResult struct {
@@ -122,7 +121,6 @@ func (p *CrossEncoderPipeline) GetStats() []string {
122121
fmt.Sprintf("Total documents scored: %d", p.stats.TotalDocuments),
123122
fmt.Sprintf("Average latency per query: %s", avgLatency),
124123
fmt.Sprintf("Average batch size: %.2f", p.stats.AverageBatchSize),
125-
fmt.Sprintf("Truncated sequences: %d", p.stats.TruncatedSequences),
126124
fmt.Sprintf("Filtered results: %d", p.stats.FilteredResults),
127125
fmt.Sprintf("Tokenizer: Total time=%s, Execution count=%d, Average query time=%s",
128126
time.Duration(p.Model.Tokenizer.TokenizerTimings.TotalNS),
@@ -142,6 +140,14 @@ func (p *CrossEncoderPipeline) Validate() error {
142140
validationErrors = append(validationErrors, fmt.Errorf("cross encoder pipeline requires a tokenizer"))
143141
}
144142

143+
if p.Model.SeparatorToken == "" {
144+
validationErrors = append(validationErrors, fmt.Errorf("cross encoder pipeline requires a separator token to be set in the model"))
145+
}
146+
147+
if p.Model.SeparatorToken != "[SEP]" && p.Model.SeparatorToken != "</s>" {
148+
validationErrors = append(validationErrors, fmt.Errorf("cross encoder pipeline only supports [SEP] (BERT) and </s> (Roberta) as separator tokens, got %s", p.Model.SeparatorToken))
149+
}
150+
145151
outDims := p.Model.OutputsMeta[0].Dimensions
146152
if len(outDims) != 2 {
147153
validationErrors = append(validationErrors, fmt.Errorf("pipeline configuration invalid: cross encoder must have 2 dimensional output"))
@@ -167,16 +173,47 @@ func (p *CrossEncoderPipeline) Validate() error {
167173
return errors.Join(validationErrors...)
168174
}
169175

176+
func patchBertSequenceTokenTypeIDs(batch *pipelineBackends.PipelineBatch, sepToken string) {
177+
// Fix token_type_ids for BERT-style models when we manually concatenated the pair as a single sequence.
178+
// Pattern expected: [CLS] query [SEP] doc [SEP]
179+
// HF sets token_type_ids=0 up to and including first [SEP], then 1 for remainder (including final [SEP]).
180+
for index := range batch.Input {
181+
input := &batch.Input[index]
182+
// Only adjust if type ids exist and are all zero
183+
allZero := true
184+
for _, t := range input.TypeIDs {
185+
if t != 0 {
186+
allZero = false
187+
break
188+
}
189+
}
190+
if !allZero || len(input.TypeIDs) == 0 {
191+
continue
192+
}
193+
// Find first [SEP] token index (skip position 0 which should be [CLS])
194+
firstSep := -1
195+
for iTok := 1; iTok < len(input.Tokens); iTok++ {
196+
if input.Tokens[iTok] == sepToken {
197+
firstSep = iTok
198+
break
199+
}
200+
}
201+
if firstSep == -1 || firstSep == len(input.Tokens)-1 { // nothing to split
202+
continue
203+
}
204+
for iTok := firstSep + 1; iTok < len(input.TypeIDs); iTok++ {
205+
input.TypeIDs[iTok] = 1
206+
}
207+
}
208+
}
209+
170210
func (p *CrossEncoderPipeline) Preprocess(batch *pipelineBackends.PipelineBatch, inputs []string) error {
171211
start := time.Now()
172212

173213
pipelineBackends.TokenizeInputs(batch, p.Model.Tokenizer, inputs)
174214

175-
// Track truncated sequences (tokenizer already handles truncation)
176-
for _, tokenizedInput := range batch.Input {
177-
if len(tokenizedInput.TokenIDs) >= p.Model.Tokenizer.MaxAllowedTokens {
178-
atomic.AddUint64(&p.stats.TruncatedSequences, 1)
179-
}
215+
if p.Model != nil && p.Model.Tokenizer != nil && p.Model.SeparatorToken == "[SEP]" {
216+
patchBertSequenceTokenTypeIDs(batch, p.Model.SeparatorToken)
180217
}
181218

182219
atomic.AddUint64(&p.Model.Tokenizer.TokenizerTimings.NumCalls, 1)
@@ -289,8 +326,16 @@ func (p *CrossEncoderPipeline) runBatch(query string, documents []string, startI
289326
var runErrors []error
290327

291328
inputs := make([]string, len(documents))
329+
sep := p.Model.SeparatorToken
330+
292331
for i, doc := range documents {
293-
inputs[i] = fmt.Sprintf("[CLS] %s [SEP] %s [SEP]", query, doc)
332+
if sep == "</s>" {
333+
// RoBERTa style: query </s> </s> document
334+
inputs[i] = fmt.Sprintf("%s%s%s%s", query, sep, sep, doc)
335+
} else {
336+
// BERT style: query [SEP] document [SEP]
337+
inputs[i] = fmt.Sprintf("%s%s%s", query, sep, doc)
338+
}
294339
}
295340

296341
batch := pipelineBackends.NewBatch(len(inputs))
@@ -300,6 +345,7 @@ func (p *CrossEncoderPipeline) runBatch(query string, documents []string, startI
300345
}(batch)
301346

302347
runErrors = append(runErrors, p.Preprocess(batch, inputs))
348+
303349
if e := errors.Join(runErrors...); e != nil {
304350
return nil, e
305351
}

scripts/run-unit-tests-container.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ echo "XLA tests completed."
2222

2323
# echo "Running training tests..."
2424

25-
# gotestsum --format testname --junitfile=$folder/unit-training.xml --jsonfile=$folder/unit-training.json -- -coverprofile=$folder/cover-training.out -coverpkg ./... -tags=ORT,XLA,TRAINING -timeout 60m
25+
gotestsum --format testname --junitfile=$folder/unit-training.xml --jsonfile=$folder/unit-training.json -- -coverprofile=$folder/cover-training.out -coverpkg ./... -tags=ORT,XLA,TRAINING -timeout 60m
2626

27-
# echo "Training tests completed."
27+
echo "Training tests completed."
2828

2929
# echo "Running simplego tests..."
3030

@@ -36,7 +36,7 @@ echo "merging coverage files"
3636
head -n 1 $folder/cover-ort.out > $folder/cover.out
3737
tail -n +2 $folder/cover-ort.out >> $folder/cover.out
3838
tail -n +2 $folder/cover-xla.out >> $folder/cover.out
39-
# tail -n +2 $folder/cover-training.out >> $folder/cover.out
39+
tail -n +2 $folder/cover-training.out >> $folder/cover.out
4040
# tail -n +2 $folder/cover-go.out >> $folder/cover.out
4141

4242
head -n 1 $folder/cover.out > $folder/cover.dedup.out

testData/downloadModels.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,12 @@ func main() {
5757
options := hugot.NewDownloadOptions()
5858
options.OnnxFilePath = model.onnxFilePath
5959
options.ExternalDataPath = model.externalDataPath
60-
fmt.Println(fmt.Sprintf("Downloading %s", model.name))
60+
fmt.Printf("Downloading %s\n", model.name)
6161
outPath, dlErr := hugot.DownloadModel(model.name, "./models", options)
6262
if dlErr != nil {
6363
panic(dlErr)
6464
}
65-
fmt.Println(fmt.Sprintf("Downloaded %s to %s", model.name, outPath))
65+
fmt.Printf("Downloaded %s to %s\n", model.name, outPath)
6666
}
6767
} else {
6868
panic(err)

0 commit comments

Comments
 (0)