Skip to content

Commit 9b841f5

Browse files
committed
The Burrows-Wheeler Transform 块排序压缩算法
1 parent 550d159 commit 9b841f5

File tree

1 file changed

+342
-0
lines changed

1 file changed

+342
-0
lines changed
Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
---
2+
title: The Burrows-Wheeler Transform 块排序压缩算法
3+
date: 2026-01-01 22:11:23
4+
updated: 2026-01-01 22:11:23
5+
categories:
6+
- 杂项
7+
tag:
8+
- 算法
9+
math: true
10+
description: 一种非常有趣的块排序压缩算法
11+
hide: false
12+
sticky: false
13+
---
14+
15+
# 前言
16+
17+
最近发现了一个非常有意思的算法:The Burrows-Wheeler Transform(块排序压缩算法),所以稍微花点时间简单记录一下
18+
19+
虽说是一个压缩算法,但这个算法主要做的事情是把数据重新排序,实际上并没有减少数据的长度(通常反而因为增加了结束符号而变长了)
20+
21+
但是它完成了一个非常有意思的效果:在几乎不增加字符串长度的情况下,把一个字符串的重复的部分尽可能聚合到一块了
22+
23+
而 Gzip 压缩算法基于 Deflate 算法,结合了 LZ77 算法(查找并替换重复字符串)和霍夫曼编码,因此先将接近的字符串聚合到一块,能够有效提高压缩率
24+
25+
# 算法操作方式
26+
27+
我做了一个简单的演示工具,我们就用经典的 `banana` 作为示例
28+
29+
<div id="v">
30+
<div class="demo-block">
31+
<div>在原串后添加结束符号<span class="demo-mono red">$</span>,且认为此符号是最小的字符</div>
32+
<div class="demo-window">
33+
banana<span class="red">\$</span>
34+
</div>
35+
</div>
36+
<div class="demo-block">
37+
<div>生成字符串的全部循环序列</div>
38+
<div class="demo-window">
39+
banana<span class="red">\$</span><br>
40+
anana<span class="red">\$</span>b<br>
41+
nana<span class="red">\$</span>ba<br>
42+
ana<span class="red">\$</span>ban<br>
43+
na<span class="red">\$</span>bana<br>
44+
a<span class="red">\$</span>banan<br>
45+
<span class="red">\$</span>banana
46+
</div>
47+
</div>
48+
<div class="demo-block">
49+
<div>将这几个字符串排序</div>
50+
<div class="demo-window">
51+
<span class="red">\$</span>banana<br>
52+
a<span class="red">\$</span>banan<br>
53+
ana<span class="red">\$</span>ban<br>
54+
anana<span class="red">\$</span>b<br>
55+
banana<span class="red">\$</span><br>
56+
na<span class="red">\$</span>bana<br>
57+
nana<span class="red">\$</span>ba
58+
</div>
59+
</div>
60+
<div class="demo-block">
61+
<div>取出最后一列字符串</div>
62+
<div class="demo-window">
63+
<span class="red">\$</span>banan<span class="green">a</span><br>
64+
a<span class="red">\$</span>bana<span class="green">n</span><br>
65+
ana<span class="red">\$</span>ba<span class="green">n</span><br>
66+
anana<span class="red">\$</span><span class="green">b</span><br>
67+
banana<span class="green">\$</span><br>
68+
na<span class="red">\$</span>ban<span class="green">a</span><br>
69+
nana<span class="red">\$</span>b<span class="green">a</span>
70+
</div>
71+
</div>
72+
<div class="demo-block">
73+
<div>得到结果</div>
74+
<div class="demo-window">
75+
annb<span class="red">\$</span>aa
76+
</div>
77+
</div>
78+
<div class="demo-block">
79+
<div>下面将进行还原操作</div>
80+
<div class="demo-window">
81+
annb<span class="red">\$</span>aa
82+
</div>
83+
</div>
84+
<div class="demo-block">
85+
<div>将结果排成一列</div>
86+
<div class="demo-window">
87+
<span class="green">a</span><br>
88+
<span class="green">n</span><br>
89+
<span class="green">n</span><br>
90+
<span class="green">b</span><br>
91+
<span class="green">\$</span><br>
92+
<span class="green">a</span><br>
93+
<span class="green">a</span>
94+
</div>
95+
</div>
96+
<div class="demo-block">
97+
<div>排序</div>
98+
<div class="demo-window">
99+
<span class="green">\$</span><br>
100+
<span class="green">a</span><br>
101+
<span class="green">a</span><br>
102+
<span class="green">a</span><br>
103+
<span class="green">b</span><br>
104+
<span class="green">n</span><br>
105+
<span class="green">n</span>
106+
</div>
107+
</div>
108+
<div class="demo-block">
109+
<div>在当前的列之前添加 BWT 结果</div>
110+
<div class="demo-window">
111+
<span class="green">a</span><span class="gray">\$</span><br>
112+
<span class="green">n</span><span class="gray">a</span><br>
113+
<span class="green">n</span><span class="gray">a</span><br>
114+
<span class="green">b</span><span class="gray">a</span><br>
115+
<span class="green">\$</span><span class="gray">b</span><br>
116+
<span class="green">a</span><span class="gray">n</span><br>
117+
<span class="green">a</span><span class="gray">n</span>
118+
</div>
119+
</div>
120+
<div class="demo-block">
121+
<div>再次排序</div>
122+
<div class="demo-window">
123+
<span class="green">\$</span><span class="gray">b</span><br>
124+
<span class="green">a</span><span class="gray">\$</span><br>
125+
<span class="green">a</span><span class="gray">n</span><br>
126+
<span class="green">a</span><span class="gray">n</span><br>
127+
<span class="green">b</span><span class="gray">a</span><br>
128+
<span class="green">n</span><span class="gray">a</span><br>
129+
<span class="green">n</span><span class="gray">a</span>
130+
</div>
131+
</div>
132+
<div class="demo-block">
133+
<div>重复上述步骤</div>
134+
<div class="demo-window">
135+
<span class="green">a</span><span class="gray">\$b</span><br>
136+
<span class="green">n</span><span class="gray">a\$</span><br>
137+
<span class="green">n</span><span class="gray">an</span><br>
138+
<span class="green">b</span><span class="gray">an</span><br>
139+
<span class="green">\$</span><span class="gray">ba</span><br>
140+
<span class="green">a</span><span class="gray">na</span><br>
141+
<span class="green">a</span><span class="gray">na</span>
142+
</div>
143+
</div>
144+
<div class="demo-block">
145+
<div>再次排序</div>
146+
<div class="demo-window">
147+
<span class="green">\$</span><span class="gray">ba</span><br>
148+
<span class="green">a</span><span class="gray">\$b</span><br>
149+
<span class="green">a</span><span class="gray">na</span><br>
150+
<span class="green">a</span><span class="gray">na</span><br>
151+
<span class="green">b</span><span class="gray">an</span><br>
152+
<span class="green">n</span><span class="gray">a\$</span><br>
153+
<span class="green">n</span><span class="gray">an</span>
154+
</div>
155+
</div>
156+
<div class="demo-block">
157+
<div>重复上述步骤</div>
158+
<div class="demo-window">
159+
<span class="green">a</span><span class="gray">\$ba</span><br>
160+
<span class="green">n</span><span class="gray">a\$b</span><br>
161+
<span class="green">n</span><span class="gray">ana</span><br>
162+
<span class="green">b</span><span class="gray">ana</span><br>
163+
<span class="green">\$</span><span class="gray">ban</span><br>
164+
<span class="green">a</span><span class="gray">na\$</span><br>
165+
<span class="green">a</span><span class="gray">nan</span>
166+
</div>
167+
</div>
168+
<div class="demo-block">
169+
<div>再次排序</div>
170+
<div class="demo-window">
171+
<span class="green">\$</span><span class="gray">ban</span><br>
172+
<span class="green">a</span><span class="gray">\$ba</span><br>
173+
<span class="green">a</span><span class="gray">na\$</span><br>
174+
<span class="green">a</span><span class="gray">nan</span><br>
175+
<span class="green">b</span><span class="gray">ana</span><br>
176+
<span class="green">n</span><span class="gray">a\$b</span><br>
177+
<span class="green">n</span><span class="gray">ana</span>
178+
</div>
179+
</div>
180+
<div class="demo-block">
181+
<div>重复上述步骤</div>
182+
<div class="demo-window">
183+
<span class="green">a</span><span class="gray">\$ban</span><br>
184+
<span class="green">n</span><span class="gray">a\$ba</span><br>
185+
<span class="green">n</span><span class="gray">ana\$</span><br>
186+
<span class="green">b</span><span class="gray">anan</span><br>
187+
<span class="green">\$</span><span class="gray">bana</span><br>
188+
<span class="green">a</span><span class="gray">na\$b</span><br>
189+
<span class="green">a</span><span class="gray">nana</span>
190+
</div>
191+
</div>
192+
<div class="demo-block">
193+
<div>再次排序</div>
194+
<div class="demo-window">
195+
<span class="green">\$</span><span class="gray">bana</span><br>
196+
<span class="green">a</span><span class="gray">\$ban</span><br>
197+
<span class="green">a</span><span class="gray">na\$b</span><br>
198+
<span class="green">a</span><span class="gray">nana</span><br>
199+
<span class="green">b</span><span class="gray">anan</span><br>
200+
<span class="green">n</span><span class="gray">a\$ba</span><br>
201+
<span class="green">n</span><span class="gray">ana\$</span>
202+
</div>
203+
</div>
204+
<div class="demo-block">
205+
<div>重复上述步骤</div>
206+
<div class="demo-window">
207+
<span class="green">a</span><span class="gray">\$bana</span><br>
208+
<span class="green">n</span><span class="gray">a\$ban</span><br>
209+
<span class="green">n</span><span class="gray">ana\$b</span><br>
210+
<span class="green">b</span><span class="gray">anana</span><br>
211+
<span class="green">\$</span><span class="gray">banan</span><br>
212+
<span class="green">a</span><span class="gray">na\$ba</span><br>
213+
<span class="green">a</span><span class="gray">nana\$</span>
214+
</div>
215+
</div>
216+
<div class="demo-block">
217+
<div>再次排序</div>
218+
<div class="demo-window">
219+
<span class="green">\$</span><span class="gray">banan</span><br>
220+
<span class="green">a</span><span class="gray">\$bana</span><br>
221+
<span class="green">a</span><span class="gray">na\$ba</span><br>
222+
<span class="green">a</span><span class="gray">nana\$</span><br>
223+
<span class="green">b</span><span class="gray">anana</span><br>
224+
<span class="green">n</span><span class="gray">a\$ban</span><br>
225+
<span class="green">n</span><span class="gray">ana\$b</span>
226+
</div>
227+
</div>
228+
<div class="demo-block">
229+
<div>重复上述步骤</div>
230+
<div class="demo-window">
231+
<span class="green">a</span><span class="gray">\$banan</span><br>
232+
<span class="green">n</span><span class="gray">a\$bana</span><br>
233+
<span class="green">n</span><span class="gray">ana\$ba</span><br>
234+
<span class="green">b</span><span class="gray">anana\$</span><br>
235+
<span class="green">\$</span><span class="gray">banana</span><br>
236+
<span class="green">a</span><span class="gray">na\$ban</span><br>
237+
<span class="green">a</span><span class="gray">nana\$b</span>
238+
</div>
239+
</div>
240+
<div class="demo-block">
241+
<div>再次排序</div>
242+
<div class="demo-window">
243+
<span class="green">\$</span><span class="gray">banana</span><br>
244+
<span class="green">a</span><span class="gray">\$banan</span><br>
245+
<span class="green">a</span><span class="gray">na\$ban</span><br>
246+
<span class="green">a</span><span class="gray">nana\$b</span><br>
247+
<span class="green">b</span><span class="gray">anana\$</span><br>
248+
<span class="green">n</span><span class="gray">a\$bana</span><br>
249+
<span class="green">n</span><span class="gray">ana\$ba</span>
250+
</div>
251+
</div>
252+
<div class="demo-block">
253+
<div>回到最初的矩阵</div>
254+
<div class="demo-window">
255+
<span class="red">\$</span>banana<br>
256+
a<span class="red">\$</span>banan<br>
257+
ana<span class="red">\$</span>ban<br>
258+
anana<span class="red">\$</span>b<br>
259+
banana<span class="red">\$</span><br>
260+
na<span class="red">\$</span>bana<br>
261+
nana<span class="red">\$</span>ba
262+
</div>
263+
</div>
264+
</div>
265+
266+
<div class="demo-button-block">
267+
<div></div>
268+
<button class="demo-button" id="prev" onclick="p()">上一步</button>
269+
<div></div>
270+
<button class="demo-button" id="next" onclick="n()">下一步</button>
271+
<div></div>
272+
</div>
273+
274+
<style>
275+
/* Previous / next step buttons. */
.demo-button {
  cursor: pointer;
  padding: 6px 14px;
  background: #fff;
  border: 1px solid #ccc;
  border-radius: 4px;
}
.demo-button:disabled {
  cursor: not-allowed;
  opacity: 0.4;
}

/* One demo step: its caption and window are centred in a grid. */
.demo-block {
  display: grid;
  place-items: center;
}

/* Button row: flexible gutters on both sides, a 30px spacer between the buttons. */
.demo-button-block {
  display: grid;
  grid-template-columns: 1fr auto 30px auto 1fr;
}

/* Fixed-height monospace box that holds the strings of a step. */
.demo-window {
  height: 200px;
  margin: 5px 0;
  padding: 5px 30px;
  border: 1px solid gray;
  font-family: monospace;
}

/* Highlight colours used inside the demo windows. */
.red {
  color: red;
}
.green {
  color: green;
}
.gray {
  color: gray;
}
310+
</style>
311+
312+
<script>
313+
// Step-through controller for the demo: exactly one child of #v is visible at a
// time, and the #prev / #next buttons move between the children.
//
// Everything lives inside an IIFE. Top-level `let` bindings in a classic script
// share one global lexical scope, so running this script twice on the same page
// (or next to another script declaring the same short names) would throw
// "Identifier has already been declared" and leave the demo broken.
(() => {
  const steps = document.getElementById('v').children,
    prevButton = document.getElementById('prev'),
    nextButton = document.getElementById('next');
  let current = 0;

  // Show only the current step and disable the button that cannot move further.
  const render = () => {
    [...steps].forEach((step, index) => {
      step.hidden = index !== current;
    });
    prevButton.disabled = current === 0;
    nextButton.disabled = current === steps.length - 1;
  };

  // The buttons invoke p() and n() from their inline onclick attributes, so these
  // two functions (and only these) are published on window.
  window.p = () => {
    if (current > 0) current--;
    render();
  };
  window.n = () => {
    if (current < steps.length - 1) current++;
    render();
  };

  // Initial state: first step visible, "previous" disabled.
  render();
})();
329+
</script>
330+
331+
332+
# 算法原理
333+
334+
我觉得这个算法有意思的地方,可能并不是它的实用价值。这里有一个非常有意思的问题:为什么这样反复排序几次之后,就会回到最初的矩阵?
335+
336+
这里蕴藏了一个非常有意思的字符串排序逻辑。
337+
338+
通常情况下,我们比较字符串时会从第一个字符开始比较,如果相同则比较下一个字符
339+
340+
而在这个问题下,假定所有字符串长度相同,那其实完全可以从最后一位比起,然后逐次比较新增加的字符,用类似桶排序(更准确地说是基数排序)的方式,达成最终排序结果
341+
342+
由于后排序的结果会覆盖先排序的结果(而当前比较的字符相同时,先前排好的相对顺序会被保留),使得实际上达成了“从第一个字符开始比较,如果相同则比下一个字符”的效果。

0 commit comments

Comments
 (0)