Skip to content

Commit f29ee75

Browse files
committed
fix batch dedup: replace prefix truncation with SHA256 content hashing
Both normalizeForDedup (80-char) and normalizeContent (200-char) used naive prefix truncation that caused false-positive dedup matches on learnings with similar openings. Replace with SHA256 hash of the full normalized content. Add test coverage for both batch commands: dedup logic, promotion criteria, citation counting, transcript discovery, and content loading.
1 parent 627f40c commit f29ee75

File tree

4 files changed

+513
-16
lines changed

4 files changed

+513
-16
lines changed

cli/cmd/ao/batch_forge.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package main
22

33
import (
4+
"crypto/sha256"
5+
"encoding/hex"
46
"fmt"
57
"os"
68
"path/filepath"
@@ -238,18 +240,15 @@ func dedupSimilar(items []string) []string {
238240
return result
239241
}
240242

241-
// normalizeForDedup creates a normalized key for deduplication.
242-
// Lowercases, trims whitespace, and truncates to first 80 chars
243-
// to catch near-duplicate snippets that differ only in trailing content.
243+
// normalizeForDedup creates a normalized key for deduplication using content hashing.
244+
// Lowercases, collapses whitespace, strips ellipsis, then SHA256 hashes the full
245+
// normalized content. This avoids false positives from naive prefix truncation.
244246
func normalizeForDedup(s string) string {
245247
s = strings.ToLower(strings.TrimSpace(s))
246-
// Remove trailing ellipsis from snippets
247248
s = strings.TrimSuffix(s, "...")
248-
s = strings.TrimSpace(s)
249-
if len(s) > 80 {
250-
s = s[:80]
251-
}
252-
return s
249+
s = strings.Join(strings.Fields(s), " ")
250+
h := sha256.Sum256([]byte(s))
251+
return hex.EncodeToString(h[:])
253252
}
254253

255254
// humanSize returns a human-readable file size string.

cli/cmd/ao/batch_forge_test.go

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
package main
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
"time"
8+
)
9+
10+
func TestNormalizeForDedup(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
a, b string
14+
wantSame bool
15+
}{
16+
{
17+
name: "exact duplicates",
18+
a: "Lead-only commit eliminates merge conflicts",
19+
b: "Lead-only commit eliminates merge conflicts",
20+
wantSame: true,
21+
},
22+
{
23+
name: "case difference",
24+
a: "Lead-Only Commit Pattern",
25+
b: "lead-only commit pattern",
26+
wantSame: true,
27+
},
28+
{
29+
name: "whitespace difference",
30+
a: " Lead-only commit pattern ",
31+
b: "Lead-only commit pattern",
32+
wantSame: true,
33+
},
34+
{
35+
name: "trailing ellipsis stripped",
36+
a: "Workers write files but never commit...",
37+
b: "Workers write files but never commit",
38+
wantSame: true,
39+
},
40+
{
41+
name: "distinct content with same 80-char prefix",
42+
a: "Topological wave decomposition extracts parallelism from dependency graphs by grouping leaves — this is about WAVES",
43+
b: "Topological wave decomposition extracts parallelism from dependency graphs by grouping leaves — this is about SORTING",
44+
wantSame: false,
45+
},
46+
{
47+
name: "short similar strings are distinct",
48+
a: "Use content hashing for dedup",
49+
b: "Use content hashing for dedup detection",
50+
wantSame: false,
51+
},
52+
{
53+
name: "completely different",
54+
a: "Workers should never commit",
55+
b: "Wave sizing follows dependency graph",
56+
wantSame: false,
57+
},
58+
}
59+
60+
for _, tt := range tests {
61+
t.Run(tt.name, func(t *testing.T) {
62+
keyA := normalizeForDedup(tt.a)
63+
keyB := normalizeForDedup(tt.b)
64+
gotSame := keyA == keyB
65+
if gotSame != tt.wantSame {
66+
t.Errorf("normalizeForDedup(%q) == normalizeForDedup(%q): got %v, want %v\n keyA=%s\n keyB=%s",
67+
tt.a, tt.b, gotSame, tt.wantSame, keyA, keyB)
68+
}
69+
})
70+
}
71+
}
72+
73+
func TestDedupSimilar(t *testing.T) {
74+
tests := []struct {
75+
name string
76+
input []string
77+
wantCount int
78+
}{
79+
{
80+
name: "nil input",
81+
input: nil,
82+
wantCount: 0,
83+
},
84+
{
85+
name: "empty input",
86+
input: []string{},
87+
wantCount: 0,
88+
},
89+
{
90+
name: "no duplicates",
91+
input: []string{"alpha", "beta", "gamma"},
92+
wantCount: 3,
93+
},
94+
{
95+
name: "exact duplicates removed",
96+
input: []string{"alpha", "beta", "alpha", "gamma", "beta"},
97+
wantCount: 3,
98+
},
99+
{
100+
name: "case-insensitive dedup",
101+
input: []string{
102+
"Lead-only commit pattern",
103+
"lead-only commit pattern",
104+
"LEAD-ONLY COMMIT PATTERN",
105+
},
106+
wantCount: 1,
107+
},
108+
{
109+
name: "preserves distinct long strings",
110+
input: []string{
111+
"Topological wave decomposition extracts parallelism from dependency graphs by grouping leaves — approach A",
112+
"Topological wave decomposition extracts parallelism from dependency graphs by grouping leaves — approach B",
113+
},
114+
wantCount: 2,
115+
},
116+
}
117+
118+
for _, tt := range tests {
119+
t.Run(tt.name, func(t *testing.T) {
120+
result := dedupSimilar(tt.input)
121+
if len(result) != tt.wantCount {
122+
t.Errorf("dedupSimilar() returned %d items, want %d. Items: %v", len(result), tt.wantCount, result)
123+
}
124+
})
125+
}
126+
}
127+
128+
func TestFindPendingTranscripts(t *testing.T) {
129+
// Create temp directory with test transcript files
130+
tmpDir, err := os.MkdirTemp("", "batch_forge_test")
131+
if err != nil {
132+
t.Fatal(err)
133+
}
134+
defer os.RemoveAll(tmpDir)
135+
136+
// Create some test JSONL files (must be > 100 bytes to pass the size filter)
137+
for _, name := range []string{"session1.jsonl", "session2.jsonl"} {
138+
content := []byte(`{"role":"user","content":"hello world, this is a test message with enough content"}` + "\n" +
139+
`{"role":"assistant","content":"hi there, this is a sufficiently long response to exceed the 100 byte minimum"}` + "\n")
140+
if err := os.WriteFile(filepath.Join(tmpDir, name), content, 0644); err != nil {
141+
t.Fatal(err)
142+
}
143+
}
144+
145+
// Create a file too small to be a transcript
146+
if err := os.WriteFile(filepath.Join(tmpDir, "tiny.jsonl"), []byte("{}"), 0644); err != nil {
147+
t.Fatal(err)
148+
}
149+
150+
// Create a non-JSONL file
151+
if err := os.WriteFile(filepath.Join(tmpDir, "readme.md"), []byte("# Hello"), 0644); err != nil {
152+
t.Fatal(err)
153+
}
154+
155+
// Create a subagents directory that should be skipped
156+
subDir := filepath.Join(tmpDir, "subagents")
157+
if err := os.MkdirAll(subDir, 0755); err != nil {
158+
t.Fatal(err)
159+
}
160+
if err := os.WriteFile(filepath.Join(subDir, "sub.jsonl"), []byte(`{"role":"user","content":"skip me please"}`+"\n"), 0644); err != nil {
161+
t.Fatal(err)
162+
}
163+
164+
candidates, err := findPendingTranscripts(tmpDir)
165+
if err != nil {
166+
t.Fatalf("findPendingTranscripts: %v", err)
167+
}
168+
169+
if len(candidates) != 2 {
170+
t.Errorf("got %d candidates, want 2", len(candidates))
171+
for _, c := range candidates {
172+
t.Logf(" candidate: %s (size=%d)", c.path, c.size)
173+
}
174+
}
175+
176+
// Verify sorted by mod time (oldest first)
177+
if len(candidates) >= 2 {
178+
if candidates[0].modTime.After(candidates[1].modTime) {
179+
t.Error("candidates not sorted by modification time (oldest first)")
180+
}
181+
}
182+
}
183+
184+
func TestFindPendingTranscriptsEmptyDir(t *testing.T) {
185+
tmpDir, err := os.MkdirTemp("", "batch_forge_empty")
186+
if err != nil {
187+
t.Fatal(err)
188+
}
189+
defer os.RemoveAll(tmpDir)
190+
191+
candidates, err := findPendingTranscripts(tmpDir)
192+
if err != nil {
193+
t.Fatalf("unexpected error: %v", err)
194+
}
195+
if len(candidates) != 0 {
196+
t.Errorf("got %d candidates, want 0", len(candidates))
197+
}
198+
}
199+
200+
func TestHumanSize(t *testing.T) {
201+
tests := []struct {
202+
bytes int64
203+
want string
204+
}{
205+
{0, "0 B"},
206+
{512, "512 B"},
207+
{1024, "1.0 KB"},
208+
{1536, "1.5 KB"},
209+
{1048576, "1.0 MB"},
210+
}
211+
212+
for _, tt := range tests {
213+
got := humanSize(tt.bytes)
214+
if got != tt.want {
215+
t.Errorf("humanSize(%d) = %q, want %q", tt.bytes, got, tt.want)
216+
}
217+
}
218+
}
219+
220+
func TestTranscriptCandidateFields(t *testing.T) {
221+
now := time.Now()
222+
c := transcriptCandidate{
223+
path: "/tmp/test.jsonl",
224+
modTime: now,
225+
size: 1234,
226+
}
227+
228+
if c.path != "/tmp/test.jsonl" {
229+
t.Error("path mismatch")
230+
}
231+
if c.size != 1234 {
232+
t.Error("size mismatch")
233+
}
234+
if !c.modTime.Equal(now) {
235+
t.Error("modTime mismatch")
236+
}
237+
}

cli/cmd/ao/batch_promote.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package main
22

33
import (
4+
"crypto/sha256"
5+
"encoding/hex"
46
"encoding/json"
57
"fmt"
68
"os"
@@ -235,16 +237,14 @@ func loadPromotedContent(baseDir string) map[string]bool {
235237
return content
236238
}
237239

238-
// normalizeContent creates a normalized key for content comparison.
239-
// Strips whitespace and lowercases for fuzzy duplicate detection.
240+
// normalizeContent creates a normalized key for content comparison using content hashing.
241+
// Lowercases, collapses whitespace, then SHA256 hashes the full normalized content.
242+
// This avoids false positives from naive prefix truncation.
240243
func normalizeContent(s string) string {
241244
s = strings.ToLower(s)
242245
s = strings.Join(strings.Fields(s), " ")
243-
// Use first 200 chars as fingerprint to avoid massive keys
244-
if len(s) > 200 {
245-
s = s[:200]
246-
}
247-
return s
246+
h := sha256.Sum256([]byte(s))
247+
return hex.EncodeToString(h[:])
248248
}
249249

250250
// outputBatchResult prints the batch-promote summary.

0 commit comments

Comments
 (0)