Browse code

integrate the DLP code from Marty Roesch

git-svn: trunk@3795

Tomasz Kojm authored on 2008/04/17 03:47:42
Showing 11 changed files
... ...
@@ -1,3 +1,9 @@
1
+Wed Apr 16 20:10:17 CEST 2008 (tk)
2
+----------------------------------
3
+  * libclamav: integrate the DLP code from Marty Roesch
4
+  * clamscan: new switch --detect-structured
5
+  * TODO: clamd, docs, dconf, fine-tuning options
6
+
1 7
 Wed Apr 16 19:32:12 CEST 2008 (acab)
2 8
 ------------------------------------
3 9
   * configure: check for bzip2 CVE-2008-1372 - bb#903
... ...
@@ -306,6 +306,7 @@ void help(void)
306 306
 #endif
307 307
     mprintf("\n");
308 308
     mprintf("    --detect-pua                         Detect Possibly Unwanted Applications\n");
309
+    mprintf("    --detect-structured                  Detect structured data (SSN, Credit Card)\n");
309 310
     mprintf("    --no-mail                            Disable mail file support\n");
310 311
     mprintf("    --no-phishing-sigs                   Disable signature-based phishing detection\n");
311 312
     mprintf("    --no-phishing-scan-urls              Disable url-based phishing detection\n");
... ...
@@ -59,6 +59,7 @@ static struct option clamscan_longopt[] = {
59 59
     {"max-recursion", 1, 0, 0},
60 60
     {"max-dir-recursion", 1, 0, 0},
61 61
     {"detect-pua", 0, 0, 0},
62
+    {"detect-structured", 0, 0, 0},
62 63
     {"disable-archive", 0, 0, 0},
63 64
     {"no-archive", 0, 0, 0},
64 65
     {"detect-broken", 0, 0, 0},
... ...
@@ -308,6 +308,15 @@ int scanmanager(const struct optstruct *opt)
308 308
     else
309 309
 	options |= CL_SCAN_ALGORITHMIC;
310 310
 
311
+    if(opt_check(opt, "detect-structured")) {
312
+	options |= CL_SCAN_STRUCTURED;
313
+        limits.min_cc_count = 1;
314
+        limits.min_ssn_count = 1;
315
+        limits.structured_flags = CL_STRUCTURED_CONF_SSN_BOTH;
316
+    } else
317
+	options &= ~CL_SCAN_STRUCTURED;
318
+
319
+
311 320
 #ifdef C_LINUX
312 321
     procdev = (dev_t) 0;
313 322
     if(stat("/proc", &sb) != -1 && !sb.st_size)
... ...
@@ -186,7 +186,9 @@ libclamav_la_SOURCES = \
186 186
 	explode.c \
187 187
 	explode.h \
188 188
 	textnorm.c \
189
-	textnorm.h
189
+	textnorm.h \
190
+	dlp.c \
191
+	dlp.h
190 192
 
191 193
 libclamav_internal_utils_la_SOURCES=str.c \
192 194
 				    str.h \
... ...
@@ -55,7 +55,7 @@ target_triplet = @target@
55 55
 @VERSIONSCRIPT_TRUE@am__append_1 = -Wl,@VERSIONSCRIPTFLAG@,@top_srcdir@/libclamav/libclamav.map
56 56
 subdir = libclamav
57 57
 DIST_COMMON = $(include_HEADERS) $(srcdir)/Makefile.am \
58
-	$(srcdir)/Makefile.in
58
+	$(srcdir)/Makefile.in COPYING
59 59
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
60 60
 am__aclocal_m4_deps = $(top_srcdir)/m4/acinclude.m4 \
61 61
 	$(top_srcdir)/m4/lib-link.m4 $(top_srcdir)/m4/lib-prefix.m4 \
... ...
@@ -89,7 +89,7 @@ am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \
89 89
 	infblock.lo pdf.lo spin.lo yc.lo elf.lo sis.lo uuencode.lo \
90 90
 	phishcheck.lo phish_domaincheck_db.lo phish_whitelist.lo \
91 91
 	regex_list.lo mspack.lo cab.lo entconv.lo hashtab.lo dconf.lo \
92
-	lzma_iface.lo explode.lo textnorm.lo
92
+	lzma_iface.lo explode.lo textnorm.lo dlp.lo
93 93
 libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
94 94
 libclamav_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
95 95
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
... ...
@@ -421,7 +421,9 @@ libclamav_la_SOURCES = \
421 421
 	explode.c \
422 422
 	explode.h \
423 423
 	textnorm.c \
424
-	textnorm.h
424
+	textnorm.h \
425
+	dlp.c \
426
+	dlp.h
425 427
 
426 428
 libclamav_internal_utils_la_SOURCES = str.c \
427 429
 				    str.h \
... ...
@@ -522,6 +524,7 @@ distclean-compile:
522 522
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chmunpack.Plo@am__quote@
523 523
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cvd.Plo@am__quote@
524 524
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dconf.Plo@am__quote@
525
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dlp.Plo@am__quote@
525 526
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dsig.Plo@am__quote@
526 527
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/elf.Plo@am__quote@
527 528
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/entconv.Plo@am__quote@
... ...
@@ -92,6 +92,8 @@ extern "C"
92 92
 #define CL_SCAN_PHISHING_BLOCKCLOAK 0x1000
93 93
 #define CL_SCAN_ELF		    0x2000
94 94
 #define CL_SCAN_PDF		    0x4000
95
+#define CL_SCAN_STRUCTURED	    0x8000
96
+
95 97
 
96 98
 /* recommended scan settings */
97 99
 #define CL_SCAN_STDOPT		(CL_SCAN_ARCHIVE | CL_SCAN_MAIL | CL_SCAN_OLE2 | CL_SCAN_HTML | CL_SCAN_PE | CL_SCAN_ALGORITHMIC | CL_SCAN_ELF)
... ...
@@ -143,6 +145,11 @@ struct cl_engine {
143 143
     void *ignored;
144 144
 };
145 145
 
146
+/* Structured data flags */
147
+#define CL_STRUCTURED_CONF_SSN_BOTH        0x00
148
+#define CL_STRUCTURED_CONF_SSN_NORMAL      0x01
149
+#define CL_STRUCTURED_CONF_SSN_STRIPPED    0x02
150
+
146 151
 struct cl_limits {
147 152
     unsigned long int maxscansize;  /* during the scanning of archives this size
148 153
 				     * will never be exceeded
... ...
@@ -155,6 +162,14 @@ struct cl_limits {
155 155
 				     * within a single archive
156 156
 				     */
157 157
     unsigned short archivememlim;   /* limit memory usage for some unpackers */
158
+
159
+    /* This is for structured data detection.  You can set the minimum
160
+     * number of occurrences of a CC# or SSN before the system will
161
+     * generate a notification.
162
+     */
163
+    unsigned long min_cc_count;
164
+    unsigned long min_ssn_count;
165
+    unsigned long structured_flags;
158 166
 };
159 167
 
160 168
 struct cl_stat {
161 169
new file mode 100644
... ...
@@ -0,0 +1,354 @@
0
+/* 
1
+ *  Simple library to detect and validate SSN and Credit Card numbers.
2
+ *
3
+ *  Copyright (C) 2007-2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Martin Roesch <roesch@sourcefire.com>
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+
22
+#if HAVE_CONFIG_H
23
+#include "clamav-config.h"
24
+#endif
25
+
26
+#include <stdio.h>
27
+#include <ctype.h>  
28
+#include <string.h>
29
+#include <stdarg.h>
30
+#include <stdlib.h>
31
+#include "dlp.h"
32
+
33
+/* detection mode macros for the contains_* functions */
34
+#define DETECT_MODE_DETECT  0
35
+#define DETECT_MODE_COUNT   1
36
+
37
+/* group number mapping is here */
38
+/* http://www.socialsecurity.gov/employer/highgroup.txt */
39
+/* here's a perl script to convert the raw data from the highgroup.txt
40
+ * file to the data set in ssn_max_group[]:
41
+--
42
+local $/;
43
+my $i = <>;
44
+my $count = 0;
45
+while ($i =~ s/(\d{3}) (\d{2})//) {
46
+    print int($2) .", ";
47
+    if ($count == 18) 
48
+    {
49
+        print "\n";
50
+        $count = 0;
51
+    }
52
+    else
53
+    {
54
+        $count++;
55
+    }
56
+ }
57
+ --
58
+  *
59
+  * run 'perl convert.pl < highgroup.txt' to generate the data
60
+  *
61
+  */
62
+
63
+/* MAX_AREA is the maximum assigned area number.  This can be derived from 
64
+ * the data in the highgroup.txt file by looking at the last area->group 
65
+ * mapping from that file.
66
+ */ 
67
+#define MAX_AREA 772
68
+ 
69
+/* array of max group numbers for a given area number */
70
+static int ssn_max_group[MAX_AREA+1] = { 0,
71
+    6, 6, 4, 8, 8, 8, 6, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 
72
+    90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 88, 88, 88, 88, 72, 72, 72, 72, 
73
+    70, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 96, 96, 96, 96, 96, 96, 96, 96, 
74
+    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 
75
+    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 
76
+    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 
77
+    96, 96, 96, 96, 96, 96, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 
78
+    94, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 
79
+    17, 17, 17, 17, 17, 17, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 
80
+    84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 82, 82, 82, 82, 82, 82, 82, 82, 
81
+    82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 
82
+    82, 82, 79, 79, 79, 79, 79, 79, 79, 79, 77, 6, 4, 99, 99, 99, 99, 99, 99, 
83
+    99, 99, 99, 53, 53, 53, 53, 53, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
84
+    99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
85
+    99, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
86
+    13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 33, 33, 
87
+    31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 6, 6, 6, 6, 6, 6, 
88
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
89
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
90
+    35, 35, 35, 35, 35, 35, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
91
+    33, 33, 33, 33, 33, 33, 29, 29, 29, 29, 29, 29, 29, 29, 27, 27, 27, 27, 27, 
92
+    67, 67, 67, 67, 67, 67, 67, 67, 99, 99, 99, 99, 99, 99, 99, 99, 63, 61, 61, 
93
+    61, 61, 61, 61, 61, 61, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
94
+    99, 99, 23, 23, 23, 23, 23, 23, 23, 21, 21, 99, 99, 99, 99, 99, 99, 99, 99, 
95
+    99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 51, 51, 51, 51, 49, 49, 49, 49, 
96
+    49, 49, 37, 37, 37, 37, 37, 37, 37, 37, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
97
+    25, 25, 25, 23, 23, 23, 33, 33, 41, 39, 53, 51, 51, 51, 27, 27, 27, 27, 27, 
98
+    27, 27, 45, 43, 79, 77, 55, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 63, 63, 
99
+    63, 61, 61, 61, 61, 61, 61, 75, 73, 73, 73, 73, 99, 99, 99, 99, 99, 99, 99, 
100
+    99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
101
+    99, 99, 99, 51, 99, 99, 45, 45, 43, 37, 99, 99, 99, 99, 99, 61, 99, 3, 99, 
102
+    99, 99, 99, 99, 99, 99, 84, 84, 84, 84, 99, 99, 67, 67, 65, 65, 65, 65, 65, 
103
+    65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 11, 
104
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 96, 
105
+    96, 44, 44, 46, 46, 46, 44, 28, 26, 26, 26, 26, 16, 16, 16, 14, 14, 14, 14, 
106
+    36, 34, 34, 34, 34, 34, 34, 34, 34, 14, 14, 12, 12, 90, 14, 14, 14, 14, 12, 
107
+    12, 12, 12, 12, 12, 9, 9, 7, 7, 7, 7, 7, 7, 7, 18, 18, 18, 18, 18, 
108
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 
109
+    28, 18, 18, 10, 14, 10, 10, 10, 10, 10, 9, 9, 3, 1, 5, 5, 5, 5, 5, 
110
+    5, 3, 3, 82, 82, 66, 66, 64, 64, 64, 64, 64
111
+};
112
+
113
+
114
+
115
/*
 * Check whether the start of a buffer looks like a valid credit card number.
 *
 * Only the first 16 bytes are examined.  Non-digit bytes inside that window
 * (spaces, dashes, ...) are skipped and the remaining digits are validated
 * with the Luhn checksum.
 *
 * Params:
 *      buffer => data to validate; must start with the first digit of the
 *                candidate number.
 *      length => number of valid bytes in buffer.  Values greater than 16
 *                are truncated to 16, values less than 13 are rejected.
 * Returns:
 *      1 if the window holds at least 13 digits that pass the Luhn check,
 *      0 otherwise.
 */
int dlp_is_valid_cc(const unsigned char *buffer, int length)
{
    int even = 0;
    int sum = 0;
    int digits = 0;
    int i;
    int val;

    if(buffer == NULL || length < 13)
        return 0;

    /* the candidate has to start with a digit, and if that first digit is
     * greater than 6 it isn't one of the major credit cards
     * reference => http://www.beachnet.com/~hstiles/cardtype.html
     */
    if(!isdigit(buffer[0]) || buffer[0] > '6')
        return 0;

    if(length > 16)
        length = 16;

    /* Luhn: walk right to left, doubling every second digit */
    for(i = length - 1; i >= 0; i--)
    {
        if(!isdigit(buffer[i]))
            continue;

        val = buffer[i] - '0';

        if(even)
        {
            val *= 2;
            if(val > 9)
                val -= 9;   /* same as adding the two digits of val */
        }

        even = !even;
        sum += val;
        digits++;
    }

    /* Without this check a window such as "18 years old and" (digits 1 and
     * 8, Luhn sum 10) or "0 ..." (sum 0) would be reported as a card.
     */
    if(digits < 13)
        return 0;

    return (sum % 10 == 0);
}
153
+
154
+static int contains_cc(const unsigned char *buffer, int length, int detmode)
155
+{
156
+    const unsigned char *idx;
157
+    const unsigned char *end;
158
+    int count = 0;
159
+    
160
+    if(buffer == NULL || length < 13)
161
+    {
162
+        return 0;         
163
+    }
164
+
165
+    end = buffer + length;
166
+    idx = buffer;
167
+    while(idx < end)
168
+    {
169
+        if(isdigit(*idx))
170
+        {
171
+            if(dlp_is_valid_cc(idx, length - (idx - buffer)) == 1)
172
+            {
173
+                if(detmode == DETECT_MODE_DETECT)
174
+                    return 1;
175
+                else
176
+                {
177
+                    count++;
178
+                    /* if we got a valid match we should increment the idx ptr
179
+                     * to gain a little performance
180
+                     */
181
+                    idx += (length > 15?15:(length-1));
182
+                }
183
+            }
184
+        }
185
+        idx++;
186
+    }
187
+    
188
+    return count;
189
+}
190
+
191
+int dlp_get_cc_count(const unsigned char *buffer, int length)
192
+{
193
+    return contains_cc(buffer, length, DETECT_MODE_COUNT);
194
+}
195
+
196
+int dlp_has_cc(const unsigned char *buffer, int length)
197
+{
198
+    return contains_cc(buffer, length, DETECT_MODE_DETECT);
199
+}
200
+
201
+int dlp_is_valid_ssn(const unsigned char *buffer, int length, int format)
202
+{
203
+    int area_number;
204
+    int group_number;
205
+    int serial_number;
206
+    int minlength;
207
+    int retval = 1;
208
+    
209
+    if(buffer == NULL)
210
+        return 0;
211
+        
212
+    minlength = (format==SSN_FORMAT_HYPHENS?11:9);
213
+
214
+    if(length < minlength)
215
+        return 0;
216
+        
217
+    /* sscanf parses and (basically) validates the string for us */
218
+    switch(format)
219
+    {
220
+        case SSN_FORMAT_HYPHENS:
221
+            if(sscanf((const char *) buffer, 
222
+                      "%3d-%2d-%4d", 
223
+                      &area_number, 
224
+                      &group_number, 
225
+                      &serial_number) != 3)
226
+            {
227
+                return 0;
228
+            }       
229
+            break;
230
+        case SSN_FORMAT_STRIPPED:
231
+             if(sscanf((const char *) buffer,  
232
+                       "%3d%2d%4d", 
233
+                       &area_number, 
234
+                       &group_number, 
235
+                       &serial_number) != 3)
236
+             {
237
+                 return 0;
238
+             }       
239
+             break;
240
+    }
241
+        
242
+    /* start validating */
243
+    /* validation data taken from 
244
+     * http://en.wikipedia.org/wiki/Social_Security_number_%28United_States%29
245
+     */
246
+    if(area_number > MAX_AREA || 
247
+       area_number == 666 || 
248
+       area_number <= 0 || 
249
+       group_number <= 0 || 
250
+       group_number > 99 || 
251
+       serial_number <= 0 ||
252
+       serial_number > 9999)
253
+        retval = 0;
254
+        
255
+    if(area_number == 987 && group_number == 65) 
256
+    {
257
+        if(serial_number >= 4320 && serial_number <= 4329)
258
+            retval = 0;
259
+    }
260
+    
261
+    if(group_number > ssn_max_group[area_number])
262
+        retval = 0;
263
+   
264
+    return retval;
265
+}
266
+
267
+static int contains_ssn(const unsigned char *buffer, int length, int format, int detmode)
268
+{
269
+    const unsigned char *idx;
270
+    const unsigned char *end;
271
+    int count = 0;
272
+    
273
+    if(buffer == NULL || length < 11)
274
+        return 0; 
275
+
276
+    end = buffer + length;
277
+    idx = buffer;
278
+    while(idx < end)
279
+    {
280
+        if(isdigit(*idx))
281
+        {
282
+            /* check for area number and the first hyphen */
283
+            if(dlp_is_valid_ssn(idx, length - (idx - buffer), format) == 1)
284
+            {
285
+                if(detmode == DETECT_MODE_COUNT)
286
+                {
287
+                    count++;
288
+                        /* hop over the matched bytes if we found an SSN */
289
+                    idx += ((format == SSN_FORMAT_HYPHENS)?11:9);
290
+                }
291
+                else
292
+                {
293
+                    return 1;                                                                            
294
+                }
295
+            }
296
+        }
297
+        idx++;
298
+    }
299
+    
300
+    return count;   
301
+}
302
+
303
+int dlp_get_stripped_ssn_count(const unsigned char *buffer, int length)
304
+{
305
+    return contains_ssn(buffer, 
306
+                        length, 
307
+                        SSN_FORMAT_STRIPPED, 
308
+                        DETECT_MODE_COUNT);
309
+}
310
+
311
+int dlp_get_normal_ssn_count(const unsigned char *buffer, int length)
312
+{
313
+    return contains_ssn(buffer, 
314
+                        length, 
315
+                        SSN_FORMAT_HYPHENS, 
316
+                        DETECT_MODE_COUNT);
317
+}
318
+
319
/* Counts SSNs in either format.  The buffer is searched twice, once per
 * format, so this will suck for performance.
 */
int dlp_get_ssn_count(const unsigned char *buffer, int length)
{
    const int stripped = dlp_get_stripped_ssn_count(buffer, length);
    const int hyphenated = dlp_get_normal_ssn_count(buffer, length);

    return stripped + hyphenated;
}
326
+
327
+int dlp_has_ssn(const unsigned char *buffer, int length)
328
+{
329
+    return (contains_ssn(buffer, 
330
+                         length, 
331
+                         SSN_FORMAT_HYPHENS, 
332
+                         DETECT_MODE_DETECT)
333
+            | contains_ssn(buffer, 
334
+                           length, 
335
+                           SSN_FORMAT_STRIPPED, 
336
+                           DETECT_MODE_DETECT));
337
+}
338
+
339
+int dlp_has_stripped_ssn(const unsigned char *buffer, int length)
340
+{
341
+    return contains_ssn(buffer, 
342
+                        length, 
343
+                        SSN_FORMAT_STRIPPED, 
344
+                        DETECT_MODE_DETECT);
345
+}
346
+
347
+int dlp_has_normal_ssn(const unsigned char *buffer, int length)
348
+{
349
+    return contains_ssn(buffer, 
350
+                        length, 
351
+                        SSN_FORMAT_HYPHENS, 
352
+                        DETECT_MODE_DETECT);
353
+}
0 354
new file mode 100644
... ...
@@ -0,0 +1,136 @@
0
+/* 
1
+ *  Simple library to detect and validate SSN and Credit Card numbers.
2
+ *
3
+ *  Copyright (C) 2007-2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Martin Roesch <roesch@sourcefire.com>
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+
22
+#ifndef __DLP_H_
23
+#define __DLP_H_
24
+
25
+#include <stdio.h>
26
+
27
+/* these macros define the SSN string format to search for */
28
+#define SSN_FORMAT_HYPHENS    0     /* xxx-yy-zzzz */
29
+#define SSN_FORMAT_STRIPPED   1     /* xxxyyzzzz */
30
+
31
+/*
32
+ * will check if a valid credit card number exists within the 
33
+ * first 16 bytes of the supplied buffer.  Validation supplied
34
+ * via the Luhn algorithm.
35
+ * Params:
36
+ *      buffer => data buffer to be validated.
37
+ *      length => length of supplied buffer.  Values greater than 16 are
38
+ *                truncated to 16.  Values less than 13 are rejected. 
39
+ * Returns:
40
+ *      1 on a find, 0 on a miss
41
+ */
42
+int dlp_is_valid_cc(const unsigned char *buffer, int length);
43
+
44
+/* Searches the supplied buffer for credit card numbers and returns
45
+ * the number of CC's found.
46
+ * Params:
47
+ *      buffer => data buffer to be analyzed.
48
+ *      length => length of buffer.  
49
+ * Returns:
50
+ *      Count of detected CC #'s.
51
+ */
52
+int dlp_get_cc_count(const unsigned char *buffer, int length);
53
+
54
+/* Searches the supplied buffer for CC #'s.  Bails out as soon as a 
55
+ * validated number is detected.
56
+ * Params:
57
+ *      buffer => data buffer to be analyzed.
58
+ *      length => length of buffer.
59
+ * Returns:
60
+ *      1 on detect, 0 on fail
61
+ */
62
+int dlp_has_cc(const unsigned char *buffer, int length);
63
+
64
+/* Checks the supplied buffer for a valid SSN number.  Validation
65
+ * is supplied via area and group number validation.  Valid numbers
66
+ * which are not in circulation (666 series, 000 series) are NOT
67
+ * detected, only numbers that can be valid in the real world.  Searches
68
+ * only the first 11 or 9 bytes (based on the selected format)!
69
+ * Params:
70
+ *      buffer => buffer to be validated
71
+ *      length => length of buffer to validate
+ *      format => SSN_FORMAT_HYPHENS (xxx-yy-zzzz) or SSN_FORMAT_STRIPPED (xxxyyzzzz)
72
+ * Returns:
73
+ *      1 on detect, 0 on failure
74
+ */
75
+int dlp_is_valid_ssn(const unsigned char *buffer, int length, int format);
76
+
77
+/* Searches the supplied buffer for valid SSNs.  Note that this function
78
+ * is effectively searching the buffer TWICE looking for the hyphenated and
79
+ * stripped forms of the SSN.  There will be a performance impact!
80
+ * Params:
81
+ *      buffer => buffer to search
82
+ *      length => length of the buffer
83
+ * Returns:
84
+ *      Count of SSNs in the supplied buffer
85
+ */
86
+int dlp_get_ssn_count(const unsigned char *buffer, int length);
87
+
88
+/* Searches the supplied buffer for valid SSNs formatted as xxxyyzzzz.
89
+ * Params:
90
+ *      buffer => buffer to search
91
+ *      length => length of the buffer
92
+ * Returns:
93
+ *      Count of SSNs in the supplied buffer.
94
+ */
95
+int dlp_get_stripped_ssn_count(const unsigned char *buffer, int length);
96
+
97
+/* Searches the supplied buffer for valid SSNs formatted as xxx-yy-zzzz.
98
+ * Params:
99
+ *      buffer => buffer to search
100
+ *      length => length of the buffer
101
+ * Returns:
102
+ *      Count of SSNs in the supplied buffer.
103
+ */
104
+int dlp_get_normal_ssn_count(const unsigned char *buffer, int length);
105
+
106
+/* Searches the supplied buffer for a SSN in any format.  This searches the
107
+ * buffer twice for both the stripped and hyphenated versions of an SSN so
108
+ * there will be a performance impact!
109
+ * Params:
110
+ *      buffer => buffer to search
111
+ *      length => length of the buffer
112
+ * Returns:
113
+ *      1 on detect, 0 on fail
114
+ */
115
+int dlp_has_ssn(const unsigned char *buffer, int length);
116
+
117
+/* Searches the supplied buffer for a SSN in the stripped xxxyyzzzz format.
118
+ * Params:
119
+ *      buffer => buffer to search
120
+ *      length => length of the buffer
121
+ * Returns:
122
+ *      1 on detect, 0 on fail
123
+ */
124
+int dlp_has_stripped_ssn(const unsigned char *buffer, int length);
125
+
126
+/* Searches the supplied buffer for a SSN in the normal xxx-yy-zzzz format.
127
+ * Params:
128
+ *      buffer => buffer to search
129
+ *      length => length of the buffer
130
+ * Returns:
131
+ *      1 on detect, 0 on fail
132
+ */
133
+int dlp_has_normal_ssn(const unsigned char *buffer, int length);
134
+
135
+#endif  /* __DLP_H_ */
... ...
@@ -101,6 +101,7 @@ typedef struct {
101 101
 #define DETECT_ENCRYPTED    (ctx->options & CL_SCAN_BLOCKENCRYPTED)
102 102
 /* #define BLOCKMAX	    (ctx->options & CL_SCAN_BLOCKMAX) */
103 103
 #define DETECT_BROKEN	    (ctx->options & CL_SCAN_BLOCKBROKEN)
104
+#define SCAN_STRUCTURED	    (ctx->options & CL_SCAN_STRUCTURED)
104 105
 
105 106
 /* based on macros from A. Melnikoff */
106 107
 #define cbswap16(v) (((v & 0xff) << 8) | (((v) >> 8) & 0xff))
... ...
@@ -89,6 +89,7 @@
89 89
 #include "textnorm.h"
90 90
 #include <zlib.h>
91 91
 #include "unzip.h"
92
+#include "dlp.h"
92 93
 
93 94
 #ifdef HAVE_BZLIB_H
94 95
 #include <bzlib.h>
... ...
@@ -1491,6 +1492,76 @@ static int cli_scanmail(int desc, cli_ctx *ctx)
1491 1491
     return ret;
1492 1492
 }
1493 1493
 
1494
+static int cli_scan_structured(int desc, cli_ctx *ctx)
1495
+{
1496
+	char buf[8192];
1497
+	int result = 0;
1498
+	unsigned int cc_count = 0;
1499
+	unsigned int ssn_count = 0;
1500
+	int done = 0;
1501
+	const struct cl_limits *lim = NULL;
1502
+	int (*ccfunc)(const unsigned char *buffer, int length);
1503
+	int (*ssnfunc)(const unsigned char *buffer, int length);
1504
+
1505
+
1506
+    if(ctx == NULL || ctx->limits == NULL)
1507
+	return CL_ENULLARG;
1508
+
1509
+    lim = ctx->limits;
1510
+
1511
+    if(lim->min_cc_count == 1)
1512
+	ccfunc = dlp_has_cc;
1513
+    else
1514
+	ccfunc = dlp_get_cc_count;
1515
+
1516
+    ssnfunc = dlp_get_ssn_count;;
1517
+
1518
+    switch(lim->structured_flags) {
1519
+
1520
+	case CL_STRUCTURED_CONF_SSN_BOTH:
1521
+	    if(lim->min_ssn_count == 1)
1522
+		ssnfunc = dlp_has_ssn;
1523
+	    else
1524
+		ssnfunc = dlp_get_ssn_count;
1525
+	    break;
1526
+
1527
+	case CL_STRUCTURED_CONF_SSN_NORMAL:
1528
+	    if(lim->min_ssn_count == 1)
1529
+		ssnfunc = dlp_has_normal_ssn;
1530
+	    else
1531
+		ssnfunc = dlp_get_normal_ssn_count;
1532
+	    break;
1533
+
1534
+	case CL_STRUCTURED_CONF_SSN_STRIPPED:
1535
+	    if(lim->min_ssn_count == 1)
1536
+		ssnfunc = dlp_has_stripped_ssn;
1537
+	    else
1538
+		ssnfunc = dlp_get_stripped_ssn_count;
1539
+	    break;
1540
+    }
1541
+
1542
+    while(((result = cli_readn(desc, buf, 8191)) > 0) && !done) {
1543
+	if((cc_count += ccfunc((const unsigned char *)buf, result)) >= lim->min_cc_count)
1544
+	    done = 1;
1545
+	if((ssn_count += ssnfunc((const unsigned char *)buf, result)) >= lim->min_ssn_count)
1546
+	    done = 1;
1547
+    }
1548
+
1549
+    if(cc_count != 0 && cc_count >= lim->min_cc_count) {
1550
+	cli_dbgmsg("cli_scan_structured: %u credit card numbers detected\n", cc_count);
1551
+	*ctx->virname = "Structured.CreditCardNumber";
1552
+	return CL_VIRUS;
1553
+    }
1554
+
1555
+    if(ssn_count != 0 && ssn_count > lim->min_ssn_count) {
1556
+	cli_dbgmsg("cli_scan_structured: %u social security numbers detected\n", ssn_count);
1557
+	*ctx->virname = "Structured.SSN";
1558
+	return CL_VIRUS;
1559
+    }
1560
+
1561
+    return CL_CLEAN;
1562
+}
1563
+
1494 1564
 static int cli_scanembpe(int desc, cli_ctx *ctx)
1495 1565
 {
1496 1566
 	int fd, bytes, ret = CL_CLEAN;
... ...
@@ -1918,12 +1989,22 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1918 1918
 	    ret = cli_check_mydoom_log(desc, ctx->virname);
1919 1919
 	    break;
1920 1920
 
1921
+	case CL_TYPE_TEXT_ASCII:
1922
+	    if(SCAN_STRUCTURED)
1923
+		/* TODO: consider calling this from cli_scanscript() for
1924
+		 * a normalised text
1925
+		 */
1926
+		ret = cli_scan_structured(desc, ctx);
1927
+	    break;
1928
+
1921 1929
 	default:
1922 1930
 	    break;
1923 1931
     }
1924
-
1925 1932
     ctx->recursion--;
1926 1933
 
1934
+    if(ret == CL_VIRUS)
1935
+	return CL_VIRUS;
1936
+
1927 1937
     if(type == CL_TYPE_ZIP && SCAN_ARCHIVE && (DCONF_ARCH & ARCH_CONF_ZIP)) {
1928 1938
 	if(sb.st_size > 1048576) {
1929 1939
 	    cli_dbgmsg("cli_magic_scandesc: Not checking for embedded PEs (zip file > 1 MB)\n");
... ...
@@ -1932,7 +2013,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1932 1932
     }
1933 1933
 
1934 1934
     /* CL_TYPE_HTML: raw HTML files are not scanned, unless safety measure activated via DCONF */
1935
-    if(type != CL_TYPE_IGNORED && (type != CL_TYPE_HTML || !(DCONF_DOC & DOC_CONF_HTML_SKIPRAW)) && ret != CL_VIRUS && !ctx->engine->sdb) {
1935
+    if(type != CL_TYPE_IGNORED && (type != CL_TYPE_HTML || !(DCONF_DOC & DOC_CONF_HTML_SKIPRAW)) && !ctx->engine->sdb) {
1936 1936
 	if(cli_scanraw(desc, ctx, type, typercg, &dettype) == CL_VIRUS)
1937 1937
 	    return CL_VIRUS;
1938 1938
     }