Browse code

vf_hqdn3d: reduce intermediate precision

11% faster on Penryn, 7% on Sandy Bridge, 5% on Bulldozer.
Negligible change to output.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>

Loren Merritt authored on 2012/07/27 07:51:15
Showing 1 changed file
... ...
@@ -33,32 +33,32 @@
33 33
 #include "video.h"
34 34
 
35 35
 typedef struct {
36
-    int coefs[4][512*16];
37
-    uint32_t *line;
36
+    int16_t coefs[4][512*16];
37
+    uint16_t *line;
38 38
     uint16_t *frame_prev[3];
39 39
     int hsub, vsub;
40 40
 } HQDN3DContext;
41 41
 
42
-static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
42
+static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
43 43
 {
44
-    int dmul = prev-cur;
45
-    unsigned int d = (dmul+0x10007FF)>>12; // 0x1000 to convert to unsigned, 7FF for rounding
44
+    int d = (prev-cur)>>4;
46 45
     return cur + coef[d];
47 46
 }
48 47
 
49 48
 static void denoise_temporal(uint8_t *src, uint8_t *dst,
50 49
                              uint16_t *frame_ant,
51 50
                              int w, int h, int sstride, int dstride,
52
-                             int *temporal)
51
+                             int16_t *temporal)
53 52
 {
54 53
     long x, y;
55 54
     uint32_t tmp;
56 55
 
56
+    temporal += 0x1000;
57
+
57 58
     for (y = 0; y < h; y++) {
58 59
         for (x = 0; x < w; x++) {
59
-            tmp = lowpass(frame_ant[x]<<8, src[x]<<16, temporal);
60
-            frame_ant[x] = (tmp+0x7F)>>8;
61
-            dst[x] = (tmp+0x7FFF)>>16;
60
+            frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]<<8, temporal);
61
+            dst[x] = (tmp+0x7F)>>8;
62 62
         }
63 63
         src += sstride;
64 64
         dst += dstride;
... ...
@@ -67,47 +67,47 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
67 67
 }
68 68
 
69 69
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
70
-                            uint32_t *line_ant, uint16_t *frame_ant,
70
+                            uint16_t *line_ant, uint16_t *frame_ant,
71 71
                             int w, int h, int sstride, int dstride,
72
-                            int *spatial, int *temporal)
72
+                            int16_t *spatial, int16_t *temporal)
73 73
 {
74 74
     long x, y;
75 75
     uint32_t pixel_ant;
76 76
     uint32_t tmp;
77 77
 
78
+    spatial  += 0x1000;
79
+    temporal += 0x1000;
80
+
78 81
     /* First line has no top neighbor. Only left one for each tmp and
79 82
      * last frame */
80
-    pixel_ant = src[0]<<16;
83
+    pixel_ant = src[0]<<8;
81 84
     for (x = 0; x < w; x++) {
82
-        line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<16, spatial);
83
-        tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
84
-        frame_ant[x] = (tmp+0x7F)>>8;
85
-        dst[x] = (tmp+0x7FFF)>>16;
85
+        line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<8, spatial);
86
+        frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
87
+        dst[x] = (tmp+0x7F)>>8;
86 88
     }
87 89
 
88 90
     for (y = 1; y < h; y++) {
89 91
         src += sstride;
90 92
         dst += dstride;
91 93
         frame_ant += w;
92
-        pixel_ant = src[0]<<16;
94
+        pixel_ant = src[0]<<8;
93 95
         for (x = 0; x < w-1; x++) {
94 96
             line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
95
-            pixel_ant = lowpass(pixel_ant, src[x+1]<<16, spatial);
96
-            tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
97
-            frame_ant[x] = (tmp+0x7F)>>8;
98
-            dst[x] = (tmp+0x7FFF)>>16;
97
+            pixel_ant = lowpass(pixel_ant, src[x+1]<<8, spatial);
98
+            frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
99
+            dst[x] = (tmp+0x7F)>>8;
99 100
         }
100 101
         line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
101
-        tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
102
-        frame_ant[x] = (tmp+0x7F)>>8;
103
-        dst[x] = (tmp+0x7FFF)>>16;
102
+        frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
103
+        dst[x] = (tmp+0x7F)>>8;
104 104
     }
105 105
 }
106 106
 
107 107
 static void denoise(uint8_t *src, uint8_t *dst,
108
-                    uint32_t *line_ant, uint16_t **frame_ant_ptr,
108
+                    uint16_t *line_ant, uint16_t **frame_ant_ptr,
109 109
                     int w, int h, int sstride, int dstride,
110
-                    int *spatial, int *temporal)
110
+                    int16_t *spatial, int16_t *temporal)
111 111
 {
112 112
     long x, y;
113 113
     uint16_t *frame_ant = *frame_ant_ptr;
... ...
@@ -129,16 +129,18 @@ static void denoise(uint8_t *src, uint8_t *dst,
129 129
                          w, h, sstride, dstride, temporal);
130 130
 }
131 131
 
132
-static void precalc_coefs(int *ct, double dist25)
132
+static void precalc_coefs(int16_t *ct, double dist25)
133 133
 {
134 134
     int i;
135 135
     double gamma, simil, C;
136 136
 
137
-    gamma = log(0.25) / log(1.0 - dist25/255.0 - 0.00001);
137
+    gamma = log(0.25) / log(1.0 - FFMIN(dist25,252.0)/255.0 - 0.00001);
138 138
 
139 139
     for (i = -255*16; i <= 255*16; i++) {
140
-        simil = 1.0 - FFABS(i) / (16*255.0);
141
-        C = pow(simil, gamma) * 65536.0 * i / 16.0;
140
+        // lowpass() truncates (not rounds) the diff, so +15/32 for the midpoint of the bin.
141
+        double f = (i + 15.0/32.0) / 16.0;
142
+        simil = 1.0 - FFABS(f) / 255.0;
143
+        C = pow(simil, gamma) * 256.0 * f;
142 144
         ct[16*256+i] = lrint(C);
143 145
     }
144 146