Skip to content

Commit 330453b

Browse files
authored
Merge pull request #80 from awxkee/f16
Better complex multiplication
2 parents 02ba68b + f0a468f commit 330453b

File tree

10 files changed

+188
-78
lines changed

10 files changed

+188
-78
lines changed

.github/workflows/build_push.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ jobs:
134134
- uses: dtolnay/rust-toolchain@nightly
135135
- run: cargo install cargo-fuzz
136136
- run: cargo fuzz run filter_complex --features neon,nightly_fcma -- -max_total_time=17
137+
- run: cargo fuzz run motion --features neon,nightly_fcma -- -max_total_time=17
137138

138139
fuzz_filters_x86:
139140
name: Fuzzing Filters 1D/2D x86

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ workspace = { members = ["fuzz", "app/accelerate", "app"] }
22

33
[package]
44
name = "libblur"
5-
version = "0.20.0"
5+
version = "0.20.1"
66
edition = "2021"
77
description = "Fast image blurring in pure Rust"
88
readme = "./README.md"

app/benches/gauss/main.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
6060
&src_image,
6161
&mut dst_bytes,
6262
GaussianBlurParams::new_from_kernel(3.),
63-
EdgeMode2D::new(EdgeMode::Clamp.as_2d()),
63+
EdgeMode::Clamp.as_2d(),
6464
ThreadingPolicy::Adaptive,
6565
ConvolutionMode::FixedPoint,
6666
)
@@ -107,7 +107,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
107107
&src_image,
108108
&mut dst_bytes,
109109
GaussianBlurParams::new_from_kernel(13.),
110-
EdgeMode2D::new(EdgeMode::Clamp.as_2d()),
110+
EdgeMode::Clamp.as_2d(),
111111
ThreadingPolicy::Adaptive,
112112
ConvolutionMode::Exact,
113113
)
@@ -123,7 +123,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
123123
&src_image,
124124
&mut dst_bytes,
125125
GaussianBlurParams::new_from_kernel(13.),
126-
EdgeMode2D::new(EdgeMode::Clamp.as_2d()),
126+
EdgeMode::Clamp.as_2d(),
127127
ThreadingPolicy::Adaptive,
128128
ConvolutionMode::FixedPoint,
129129
)

app/src/main.rs

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use libblur::{
3636
bilateral_filter, complex_gaussian_kernel, fast_bilateral_filter, fast_bilateral_filter_u16,
3737
filter_1d_complex, filter_1d_complex_fixed_point, gaussian_blur, gaussian_kernel_1d,
3838
lens_kernel, sigma_size, AnisotropicRadius, BilateralBlurParams, BlurImage, BlurImageMut,
39-
BoxBlurParameters, CLTParameters, ConvolutionMode, EdgeMode, FastBlurChannels,
39+
BoxBlurParameters, CLTParameters, ConvolutionMode, EdgeMode, EdgeMode2D, FastBlurChannels,
4040
GaussianBlurParams, KernelShape, Scalar, ThreadingPolicy, TransferFunction,
4141
};
4242
use num_complex::Complex;
@@ -80,9 +80,9 @@ fn main() {
8080

8181
println!("{:?}", dyn_image.color());
8282

83-
let img = dyn_image.to_rgba8();
83+
let img = dyn_image.to_rgb8();
8484
let mut src_bytes = img.as_bytes();
85-
let components = 4;
85+
let components = 3;
8686
let stride = dimensions.0 as usize * components;
8787
let mut bytes: Vec<u8> = src_bytes.to_vec();
8888
let mut dst_bytes: Vec<u8> = src_bytes.to_vec();
@@ -92,10 +92,10 @@ fn main() {
9292
let mut v_vec = src_bytes
9393
.to_vec()
9494
.iter()
95-
// .map(|&x| x)
95+
.map(|&x| x)
9696
// .map(|&x| (x as f32 / 255.))
97-
.map(|&x| u16::from_ne_bytes([x, x]))
98-
.collect::<Vec<u16>>();
97+
// .map(|&x| u16::from_ne_bytes([x, x]))
98+
.collect::<Vec<u8>>();
9999

100100
// let mut dst_image = BlurImageMut::borrow(
101101
// &mut v_vec,
@@ -109,7 +109,7 @@ fn main() {
109109
&v_vec,
110110
dyn_image.width(),
111111
dyn_image.height(),
112-
FastBlurChannels::Channels4,
112+
FastBlurChannels::Channels3,
113113
);
114114
// let vcvt = cvt.linearize(TransferFunction::Srgb, true).unwrap();
115115

@@ -157,13 +157,11 @@ fn main() {
157157

158158
// }
159159

160-
libblur::box_blur_u16(
160+
libblur::sobel(
161161
&cvt,
162162
&mut dst_image,
163-
BoxBlurParameters {
164-
x_axis_kernel: 7,
165-
y_axis_kernel: 7,
166-
},
163+
EdgeMode2D::default(),
164+
Scalar::default(),
167165
ThreadingPolicy::Single,
168166
)
169167
.unwrap();
@@ -186,17 +184,17 @@ fn main() {
186184
// )
187185
// .unwrap();
188186

189-
let j_dag = dst_image.to_immutable_ref();
187+
// let j_dag = dst_image.to_immutable_ref();
190188

191189
// let gamma = j_dag.gamma8(TransferFunction::Srgb, true).unwrap();
192190

193191
dst_bytes = dst_image
194192
.data
195193
.borrow_mut()
196194
.iter()
197-
// .map(|&x| x)
195+
.map(|&x| x)
198196
// .map(|&x| (x * 255f32).round() as u8)
199-
.map(|&x| (x >> 8) as u8)
197+
// .map(|&x| (x >> 8) as u8)
200198
.collect::<Vec<u8>>();
201199

202200
// dst_bytes = dst_image.data.borrow().to_vec();
@@ -225,7 +223,7 @@ fn main() {
225223

226224
if components == 3 {
227225
image::save_buffer(
228-
"blurred_stack_next.jpg",
226+
"blurred_stack_next1.jpg",
229227
bytes.as_bytes(),
230228
dimensions.0,
231229
dimensions.1,
@@ -234,7 +232,7 @@ fn main() {
234232
.unwrap();
235233
} else {
236234
image::save_buffer(
237-
"blurred_stack_next_f.png",
235+
"blurred_stack_next_f1.png",
238236
bytes.as_bytes(),
239237
dimensions.0,
240238
dimensions.1,

src/filter2d/avx/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
*/
2929
mod convolve_op_u8_i16_fp;
3030
#[cfg(feature = "fft")]
31-
mod mul_spectrum;
31+
mod mul_spectrum_f32;
3232

3333
pub(crate) use convolve_op_u8_i16_fp::convolve_segment_sse_2d_u8_i16_fp;
3434
#[cfg(feature = "fft")]
35-
pub(crate) use mul_spectrum::avx_fma_mul_spectrum_in_place_f32;
35+
pub(crate) use mul_spectrum_f32::avx_fma_mul_spectrum_in_place_f32;
Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,13 @@ unsafe fn avx_interleave(a: __m256, b: __m256) -> (__m256, __m256) {
6565
}
6666

6767
#[inline]
68-
#[target_feature(enable = "avx2")]
69-
unsafe fn sse_unpacklo_ps(a: __m128i) -> (__m128, __m128) {
70-
let v2 = _mm_unpacklo_epi32(a, _mm_setzero_si128()); // a0 a2 b0 b2
71-
72-
let a = _mm_unpacklo_epi32(v2, _mm_setzero_si128()); // a0 a1 a2 a3
73-
let b = _mm_unpackhi_epi32(v2, _mm_setzero_si128()); // b0 b1 ab b3
74-
(_mm_castsi128_ps(a), _mm_castsi128_ps(b))
68+
#[target_feature(enable = "avx2", enable = "fma")]
69+
unsafe fn complex_mul_fma(a: __m128, b: __m128) -> __m128 {
70+
let temp1 = _mm_shuffle_ps::<0xA0>(b, b);
71+
let temp2 = _mm_shuffle_ps::<0xF5>(b, b);
72+
let mul2 = _mm_mul_ps(a, temp2);
73+
let mul2 = _mm_shuffle_ps::<0xB1>(mul2, mul2);
74+
_mm_fmaddsub_ps(a, temp1, mul2)
7575
}
7676

7777
#[target_feature(enable = "avx2", enable = "fma")]
@@ -89,15 +89,15 @@ unsafe fn mul_spectrum_in_place_f32_impl(
8989
let other = &other[..complex_size];
9090

9191
for (dst, kernel) in value1.chunks_exact_mut(16).zip(other.chunks_exact(16)) {
92-
let vd0 = _mm256_loadu_ps(dst.as_ptr() as *const f32);
93-
let vd1 = _mm256_loadu_ps(dst.get_unchecked(4..).as_ptr() as *const f32);
94-
let vd2 = _mm256_loadu_ps(dst.get_unchecked(8..).as_ptr() as *const f32);
95-
let vd3 = _mm256_loadu_ps(dst.get_unchecked(12..).as_ptr() as *const f32);
92+
let vd0 = _mm256_loadu_ps(dst.as_ptr().cast());
93+
let vd1 = _mm256_loadu_ps(dst.get_unchecked(4..).as_ptr().cast());
94+
let vd2 = _mm256_loadu_ps(dst.get_unchecked(8..).as_ptr().cast());
95+
let vd3 = _mm256_loadu_ps(dst.get_unchecked(12..).as_ptr().cast());
9696

97-
let vk0 = _mm256_loadu_ps(kernel.as_ptr() as *const f32);
98-
let vk1 = _mm256_loadu_ps(kernel.get_unchecked(4..).as_ptr() as *const f32);
99-
let vk2 = _mm256_loadu_ps(kernel.get_unchecked(8..).as_ptr() as *const f32);
100-
let vk3 = _mm256_loadu_ps(kernel.get_unchecked(12..).as_ptr() as *const f32);
97+
let vk0 = _mm256_loadu_ps(kernel.as_ptr().cast());
98+
let vk1 = _mm256_loadu_ps(kernel.get_unchecked(4..).as_ptr().cast());
99+
let vk2 = _mm256_loadu_ps(kernel.get_unchecked(8..).as_ptr().cast());
100+
let vk3 = _mm256_loadu_ps(kernel.get_unchecked(12..).as_ptr().cast());
101101

102102
let (ar0, ai0) = avx_deinterleave(vd0, vd1);
103103
let (ar1, ai1) = avx_deinterleave(vd2, vd3);
@@ -123,18 +123,18 @@ unsafe fn mul_spectrum_in_place_f32_impl(
123123
let (d0, d1) = avx_interleave(prod_r0, prod_i0);
124124
let (d2, d3) = avx_interleave(prod_r1, prod_i1);
125125

126-
_mm256_storeu_ps(dst.as_mut_ptr() as *mut f32, d0);
127-
_mm256_storeu_ps(dst.get_unchecked_mut(4..).as_mut_ptr() as *mut f32, d1);
128-
_mm256_storeu_ps(dst.get_unchecked_mut(8..).as_mut_ptr() as *mut f32, d2);
129-
_mm256_storeu_ps(dst.get_unchecked_mut(12..).as_mut_ptr() as *mut f32, d3);
126+
_mm256_storeu_ps(dst.as_mut_ptr().cast(), d0);
127+
_mm256_storeu_ps(dst.get_unchecked_mut(4..).as_mut_ptr().cast(), d1);
128+
_mm256_storeu_ps(dst.get_unchecked_mut(8..).as_mut_ptr().cast(), d2);
129+
_mm256_storeu_ps(dst.get_unchecked_mut(12..).as_mut_ptr().cast(), d3);
130130
}
131131

132132
let dst_rem = value1.chunks_exact_mut(16).into_remainder();
133133
let src_rem = other.chunks_exact(16).remainder();
134134

135135
for (dst, kernel) in dst_rem.chunks_exact_mut(4).zip(src_rem.chunks_exact(4)) {
136-
let a0 = _mm256_loadu_ps(dst.as_ptr() as *const f32);
137-
let b0 = _mm256_loadu_ps(kernel.as_ptr() as *const f32);
136+
let a0 = _mm256_loadu_ps(dst.as_ptr().cast());
137+
let b0 = _mm256_loadu_ps(kernel.as_ptr().cast());
138138

139139
let (ar0, ai0) = avx_deinterleave(a0, _mm256_setzero_ps());
140140
let (br0, bi0) = avx_deinterleave(b0, _mm256_setzero_ps());
@@ -149,7 +149,7 @@ unsafe fn mul_spectrum_in_place_f32_impl(
149149

150150
let (d0, _) = avx_interleave(prod_r0, prod_i0);
151151

152-
_mm256_storeu_ps(dst.as_mut_ptr() as *mut f32, d0);
152+
_mm256_storeu_ps(dst.as_mut_ptr().cast(), d0);
153153
}
154154

155155
let dst_rem = dst_rem.chunks_exact_mut(4).into_remainder();
@@ -159,18 +159,7 @@ unsafe fn mul_spectrum_in_place_f32_impl(
159159
let v0 = _mm_loadu_si64(dst as *const Complex<f32> as *const _);
160160
let v1 = _mm_loadu_si64(kernel as *const Complex<f32> as *const _);
161161

162-
let (ar0, ai0) = sse_unpacklo_ps(v0);
163-
let (br0, bi0) = sse_unpacklo_ps(v1);
164-
165-
let mut prod_r0 = _mm_mul_ps(ar0, br0);
166-
let mut prod_i0 = _mm_mul_ps(ar0, bi0);
167-
prod_r0 = _mm_fnmadd_ps(ai0, bi0, prod_r0);
168-
prod_i0 = _mm_fmadd_ps(ai0, br0, prod_i0);
169-
170-
prod_r0 = _mm_mul_ps(prod_r0, _mm256_castps256_ps128(v_norm_factor));
171-
prod_i0 = _mm_mul_ps(prod_i0, _mm256_castps256_ps128(v_norm_factor));
172-
173-
let lo = _mm_unpacklo_ps(prod_r0, prod_i0);
162+
let lo = complex_mul_fma(_mm_castsi128_ps(v0), _mm_castsi128_ps(v1));
174163

175164
_mm_storeu_si64(dst as *mut Complex<f32> as *mut _, _mm_castps_si128(lo));
176165
}

src/filter2d/mul_spectrum.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ impl SpectrumMultiplier<f32> for f32 {
7272
#[cfg(all(target_arch = "aarch64", feature = "nightly_fcma"))]
7373
{
7474
if std::arch::is_aarch64_feature_detected!("fcma") {
75-
use crate::filter2d::neon::neon_mul_spectrum_in_place_f32;
76-
return neon_mul_spectrum_in_place_f32(value1, other, width, height);
75+
use crate::filter2d::neon::fcma_mul_spectrum_in_place_f32;
76+
return fcma_mul_spectrum_in_place_f32(value1, other, width, height);
7777
}
7878
}
7979
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
@@ -96,7 +96,15 @@ impl SpectrumMultiplier<f32> for f32 {
9696
}
9797
}
9898
}
99-
mul_spectrum_in_place_impl(value1, other, width, height);
99+
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
100+
{
101+
use crate::filter2d::neon::neon_mul_spectrum_in_place_f32;
102+
neon_mul_spectrum_in_place_f32(value1, other, width, height);
103+
}
104+
#[cfg(not(all(target_arch = "aarch64", feature = "neon")))]
105+
{
106+
mul_spectrum_in_place_impl(value1, other, width, height);
107+
}
100108
}
101109
}
102110

@@ -111,6 +119,7 @@ impl SpectrumMultiplier<f64> for f64 {
111119
}
112120
}
113121

122+
#[allow(dead_code)]
114123
#[inline(always)]
115124
fn mul_spectrum_in_place_impl<V: FftNum + Mul<V>>(
116125
value1: &mut [Complex<V>],

src/filter2d/neon/mod.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,14 @@ mod convolve_op_u8_f32;
3030
mod convolve_op_u8_i16;
3131
mod convolve_op_u8_i16_fp;
3232
#[cfg(all(feature = "nightly_fcma", feature = "fft"))]
33-
mod mul_spectrum;
33+
mod mul_spectrum_fcma;
34+
#[cfg(feature = "fft")]
35+
mod mul_spectrum_neon;
3436

3537
pub(crate) use convolve_op_u8_f32::convolve_segment_neon_2d_u8_f32;
3638
pub(crate) use convolve_op_u8_i16::convolve_segment_neon_2d_u8_i16;
3739
pub(crate) use convolve_op_u8_i16_fp::convolve_segment_neon_2d_u8_i16_fp;
3840
#[cfg(all(feature = "nightly_fcma", feature = "fft"))]
39-
pub(crate) use mul_spectrum::neon_mul_spectrum_in_place_f32;
41+
pub(crate) use mul_spectrum_fcma::fcma_mul_spectrum_in_place_f32;
42+
#[cfg(feature = "fft")]
43+
pub(crate) use mul_spectrum_neon::neon_mul_spectrum_in_place_f32;
Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
use rustfft::num_complex::Complex;
3030
use std::arch::aarch64::*;
3131

32-
pub(crate) fn neon_mul_spectrum_in_place_f32(
32+
pub(crate) fn fcma_mul_spectrum_in_place_f32(
3333
value1: &mut [Complex<f32>],
3434
other: &[Complex<f32>],
3535
width: usize,
@@ -56,15 +56,15 @@ unsafe fn mul_spectrum_in_place_f32_impl(
5656
let zero = vdupq_n_f32(0.);
5757

5858
for (dst, kernel) in value1.chunks_exact_mut(8).zip(other.chunks_exact(8)) {
59-
let vd0 = vld1q_f32(dst.as_ptr() as *const f32);
60-
let vd1 = vld1q_f32(dst.as_ptr().add(2) as *const f32);
61-
let vd2 = vld1q_f32(dst.as_ptr().add(4) as *const f32);
62-
let vd3 = vld1q_f32(dst.as_ptr().add(6) as *const f32);
59+
let vd0 = vld1q_f32(dst.as_ptr().cast());
60+
let vd1 = vld1q_f32(dst.as_ptr().add(2).cast());
61+
let vd2 = vld1q_f32(dst.as_ptr().add(4).cast());
62+
let vd3 = vld1q_f32(dst.as_ptr().add(6).cast());
6363

64-
let vk0 = vld1q_f32(kernel.as_ptr() as *const f32);
65-
let vk1 = vld1q_f32(kernel.as_ptr().add(2) as *const f32);
66-
let vk2 = vld1q_f32(kernel.as_ptr().add(4) as *const f32);
67-
let vk3 = vld1q_f32(kernel.as_ptr().add(6) as *const f32);
64+
let vk0 = vld1q_f32(kernel.as_ptr().cast());
65+
let vk1 = vld1q_f32(kernel.as_ptr().add(2).cast());
66+
let vk2 = vld1q_f32(kernel.as_ptr().add(4).cast());
67+
let vk3 = vld1q_f32(kernel.as_ptr().add(6).cast());
6868

6969
let p0 = vmulq_f32(
7070
vcmlaq_rot90_f32(vcmlaq_f32(zero, vd0, vk0), vd0, vk0),
@@ -83,21 +83,21 @@ unsafe fn mul_spectrum_in_place_f32_impl(
8383
v_norm_factor,
8484
);
8585

86-
vst1q_f32(dst.as_mut_ptr() as *mut f32, p0);
87-
vst1q_f32(dst.get_unchecked_mut(2..).as_mut_ptr() as *mut f32, p1);
88-
vst1q_f32(dst.get_unchecked_mut(4..).as_mut_ptr() as *mut f32, p2);
89-
vst1q_f32(dst.get_unchecked_mut(6..).as_mut_ptr() as *mut f32, p3);
86+
vst1q_f32(dst.as_mut_ptr().cast(), p0);
87+
vst1q_f32(dst.get_unchecked_mut(2..).as_mut_ptr().cast(), p1);
88+
vst1q_f32(dst.get_unchecked_mut(4..).as_mut_ptr().cast(), p2);
89+
vst1q_f32(dst.get_unchecked_mut(6..).as_mut_ptr().cast(), p3);
9090
}
9191

9292
let dst_rem = value1.chunks_exact_mut(8).into_remainder();
9393
let src_rem = other.chunks_exact(8).remainder();
9494

9595
for (dst, kernel) in dst_rem.chunks_exact_mut(2).zip(src_rem.chunks_exact(2)) {
96-
let v0 = vld1q_f32(dst.as_ptr() as *const f32);
97-
let v1 = vld1q_f32(kernel.as_ptr() as *const f32);
96+
let v0 = vld1q_f32(dst.as_ptr().cast());
97+
let v1 = vld1q_f32(kernel.as_ptr().cast());
9898
let p0 = vcmlaq_rot90_f32(vcmlaq_f32(zero, v0, v1), v0, v1);
9999
let p1 = vmulq_f32(p0, v_norm_factor);
100-
vst1q_f32(dst.as_mut_ptr() as *mut f32, p1);
100+
vst1q_f32(dst.as_mut_ptr().cast(), p1);
101101
}
102102

103103
let dst_rem = dst_rem.chunks_exact_mut(2).into_remainder();

0 commit comments

Comments
 (0)