11% faster on Penryn, 7% on Sandy Bridge, 5% on Bulldozer.
Negligible change to output.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
... | ... |
@@ -33,32 +33,32 @@ |
33 | 33 |
#include "video.h" |
34 | 34 |
|
35 | 35 |
typedef struct { |
36 |
- int coefs[4][512*16]; |
|
37 |
- uint32_t *line; |
|
36 |
+ int16_t coefs[4][512*16]; |
|
37 |
+ uint16_t *line; |
|
38 | 38 |
uint16_t *frame_prev[3]; |
39 | 39 |
int hsub, vsub; |
40 | 40 |
} HQDN3DContext; |
41 | 41 |
|
42 |
-static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef) |
|
42 |
+static inline uint32_t lowpass(int prev, int cur, int16_t *coef) |
|
43 | 43 |
{ |
44 |
- int dmul = prev-cur; |
|
45 |
- unsigned int d = (dmul+0x10007FF)>>12; // 0x1000 to convert to unsigned, 7FF for rounding |
|
44 |
+ int d = (prev-cur)>>4; |
|
46 | 45 |
return cur + coef[d]; |
47 | 46 |
} |
48 | 47 |
|
49 | 48 |
static void denoise_temporal(uint8_t *src, uint8_t *dst, |
50 | 49 |
uint16_t *frame_ant, |
51 | 50 |
int w, int h, int sstride, int dstride, |
52 |
- int *temporal) |
|
51 |
+ int16_t *temporal) |
|
53 | 52 |
{ |
54 | 53 |
long x, y; |
55 | 54 |
uint32_t tmp; |
56 | 55 |
|
56 |
+ temporal += 0x1000; |
|
57 |
+ |
|
57 | 58 |
for (y = 0; y < h; y++) { |
58 | 59 |
for (x = 0; x < w; x++) { |
59 |
- tmp = lowpass(frame_ant[x]<<8, src[x]<<16, temporal); |
|
60 |
- frame_ant[x] = (tmp+0x7F)>>8; |
|
61 |
- dst[x] = (tmp+0x7FFF)>>16; |
|
60 |
+ frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]<<8, temporal); |
|
61 |
+ dst[x] = (tmp+0x7F)>>8; |
|
62 | 62 |
} |
63 | 63 |
src += sstride; |
64 | 64 |
dst += dstride; |
... | ... |
@@ -67,47 +67,47 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst, |
67 | 67 |
} |
68 | 68 |
|
69 | 69 |
static void denoise_spatial(uint8_t *src, uint8_t *dst, |
70 |
- uint32_t *line_ant, uint16_t *frame_ant, |
|
70 |
+ uint16_t *line_ant, uint16_t *frame_ant, |
|
71 | 71 |
int w, int h, int sstride, int dstride, |
72 |
- int *spatial, int *temporal) |
|
72 |
+ int16_t *spatial, int16_t *temporal) |
|
73 | 73 |
{ |
74 | 74 |
long x, y; |
75 | 75 |
uint32_t pixel_ant; |
76 | 76 |
uint32_t tmp; |
77 | 77 |
|
78 |
+ spatial += 0x1000; |
|
79 |
+ temporal += 0x1000; |
|
80 |
+ |
|
78 | 81 |
/* First line has no top neighbor. Only left one for each tmp and |
79 | 82 |
* last frame */ |
80 |
- pixel_ant = src[0]<<16; |
|
83 |
+ pixel_ant = src[0]<<8; |
|
81 | 84 |
for (x = 0; x < w; x++) { |
82 |
- line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<16, spatial); |
|
83 |
- tmp = lowpass(frame_ant[x]<<8, tmp, temporal); |
|
84 |
- frame_ant[x] = (tmp+0x7F)>>8; |
|
85 |
- dst[x] = (tmp+0x7FFF)>>16; |
|
85 |
+ line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<8, spatial); |
|
86 |
+ frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal); |
|
87 |
+ dst[x] = (tmp+0x7F)>>8; |
|
86 | 88 |
} |
87 | 89 |
|
88 | 90 |
for (y = 1; y < h; y++) { |
89 | 91 |
src += sstride; |
90 | 92 |
dst += dstride; |
91 | 93 |
frame_ant += w; |
92 |
- pixel_ant = src[0]<<16; |
|
94 |
+ pixel_ant = src[0]<<8; |
|
93 | 95 |
for (x = 0; x < w-1; x++) { |
94 | 96 |
line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial); |
95 |
- pixel_ant = lowpass(pixel_ant, src[x+1]<<16, spatial); |
|
96 |
- tmp = lowpass(frame_ant[x]<<8, tmp, temporal); |
|
97 |
- frame_ant[x] = (tmp+0x7F)>>8; |
|
98 |
- dst[x] = (tmp+0x7FFF)>>16; |
|
97 |
+ pixel_ant = lowpass(pixel_ant, src[x+1]<<8, spatial); |
|
98 |
+ frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal); |
|
99 |
+ dst[x] = (tmp+0x7F)>>8; |
|
99 | 100 |
} |
100 | 101 |
line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial); |
101 |
- tmp = lowpass(frame_ant[x]<<8, tmp, temporal); |
|
102 |
- frame_ant[x] = (tmp+0x7F)>>8; |
|
103 |
- dst[x] = (tmp+0x7FFF)>>16; |
|
102 |
+ frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal); |
|
103 |
+ dst[x] = (tmp+0x7F)>>8; |
|
104 | 104 |
} |
105 | 105 |
} |
106 | 106 |
|
107 | 107 |
static void denoise(uint8_t *src, uint8_t *dst, |
108 |
- uint32_t *line_ant, uint16_t **frame_ant_ptr, |
|
108 |
+ uint16_t *line_ant, uint16_t **frame_ant_ptr, |
|
109 | 109 |
int w, int h, int sstride, int dstride, |
110 |
- int *spatial, int *temporal) |
|
110 |
+ int16_t *spatial, int16_t *temporal) |
|
111 | 111 |
{ |
112 | 112 |
long x, y; |
113 | 113 |
uint16_t *frame_ant = *frame_ant_ptr; |
... | ... |
@@ -129,16 +129,18 @@ static void denoise(uint8_t *src, uint8_t *dst, |
129 | 129 |
w, h, sstride, dstride, temporal); |
130 | 130 |
} |
131 | 131 |
|
132 |
-static void precalc_coefs(int *ct, double dist25) |
|
132 |
+static void precalc_coefs(int16_t *ct, double dist25) |
|
133 | 133 |
{ |
134 | 134 |
int i; |
135 | 135 |
double gamma, simil, C; |
136 | 136 |
|
137 |
- gamma = log(0.25) / log(1.0 - dist25/255.0 - 0.00001); |
|
137 |
+ gamma = log(0.25) / log(1.0 - FFMIN(dist25,252.0)/255.0 - 0.00001); |
|
138 | 138 |
|
139 | 139 |
for (i = -255*16; i <= 255*16; i++) { |
140 |
- simil = 1.0 - FFABS(i) / (16*255.0); |
|
141 |
- C = pow(simil, gamma) * 65536.0 * i / 16.0; |
|
140 |
+ // lowpass() truncates (not rounds) the diff, so +15/32 for the midpoint of the bin. |
|
141 |
+ double f = (i + 15.0/32.0) / 16.0; |
|
142 |
+ simil = 1.0 - FFABS(f) / 255.0; |
|
143 |
+ C = pow(simil, gamma) * 256.0 * f; |
|
142 | 144 |
ct[16*256+i] = lrint(C); |
143 | 145 |
} |
144 | 146 |
|