...
|
...
|
@@ -1,15 +1,14 @@
|
1
|
1
|
%% LyX 1.5.3 created this file. For more info, see http://www.lyx.org/.
|
2
|
2
|
%% Do not edit unless you really know what you are doing.
|
3
|
|
-\documentclass[a4paper,english]{article}
|
4
|
|
-\usepackage{mathptmx}
|
5
|
|
-\usepackage[T1]{fontenc}
|
6
|
|
-\usepackage{varioref}
|
7
|
|
-\usepackage{prettyref}
|
|
3
|
+\documentclass[a4paper,english,12pt]{article}
|
8
|
4
|
\usepackage{amssymb}
|
9
|
5
|
\usepackage{pslatex}
|
|
6
|
+\usepackage[T1]{fontenc}
|
10
|
7
|
\usepackage[dvips]{graphicx}
|
11
|
|
-\usepackage{wrapfig}
|
12
|
8
|
\usepackage{url}
|
|
9
|
+\usepackage{fancyhdr}
|
|
10
|
+\usepackage{varioref}
|
|
11
|
+\usepackage{prettyref}
|
13
|
12
|
\date{}
|
14
|
13
|
|
15
|
14
|
\begin{document}
|
...
|
...
|
@@ -18,6 +17,8 @@
|
18
|
18
|
\author{T\"or\"ok Edwin}
|
19
|
19
|
\maketitle
|
20
|
20
|
|
|
21
|
+%TODO: define a LaTeX command, instead of using \textsc{RealURL} each time
|
|
22
|
+
|
21
|
23
|
\section{Database file format}
|
22
|
24
|
|
23
|
25
|
\subsection{PDB format}
|
...
|
...
|
@@ -42,9 +43,9 @@ H[Filter]:DisplayedHostname[:FuncLevelSpec]
|
42
|
42
|
\item for details on how to construct a flag number see section \prettyref{sec:Flags}
|
43
|
43
|
\end{itemize}
|
44
|
44
|
|
45
|
|
- \item [{RealURL }] is the URL the user is sent to
|
46
|
|
- \item [{DisplayedURL}] is the URL description displayed to the user, that is where it is \emph{claimed} they are sent, the most obvious example is that of an html anchor (<a>tag): its href attribute is the \textsc{realURL}, and its contents is the \textsc{displayedURL}
|
47
|
|
- \item [{DisplayedHostname}] is the hostname portion of the [{DisplayedURL}]
|
|
45
|
+ \item [{\textsc{RealURL}}] is the URL the user is sent to, example: \emph{href} attribute of an html anchor (\emph{<a> tag})
|
|
46
|
+ \item [{\textsc{DisplayedURL}}] is the URL description displayed to the user, where it is \emph{claimed} they are sent, example: contents of an html anchor (\emph{<a> tag})
|
|
47
|
+ \item [{DisplayedHostname}] is the hostname portion of the \textsc{DisplayedURL}
|
48
|
48
|
\item [{FuncLevelSpec}] an (optional) functionality level, 2 formats are possible:
|
49
|
49
|
\begin{itemize}
|
50
|
50
|
\item \verb+minlevel+ all engines having functionality level >= \verb+minlevel+ will load this line
|
...
|
...
|
@@ -61,13 +62,13 @@ M:RealHostname:DisplayedHostname[:FuncLevelSpec]
|
61
|
61
|
\end{verbatim}
|
62
|
62
|
|
63
|
63
|
\begin{description}
|
64
|
|
- \item [{X}] regular expression, for the \textsc{entire URL}, not just the hostname
|
|
64
|
+ \item [{X}] regular expression, for the \emph{entire URL}, not just the hostname
|
65
|
65
|
\begin{itemize}
|
66
|
66
|
\item The regular expression is by default anchored to start-of-line and end-of-line, as if you have used \verb+^RegularExpression$+
|
67
|
67
|
\item A trailing \verb+/+ is automatically added both to the regex, and the input string to avoid false matches
|
68
|
|
- \item The regular expression matches the \textsc{concatenation} of RealURL, a colon(\verb+:+), and DisplayedURL as a single string. It doesn't separately match RealURL and DisplayedURL!
|
|
68
|
+ \item The regular expression matches the \emph{concatenation} of the \textsc{RealURL}, a colon(\verb+:+), and the \textsc{DisplayedURL} as a single string. It doesn't separately match \textsc{RealURL} and \textsc{DisplayedURL}!
|
69
|
69
|
\end{itemize}
|
70
|
|
- \item [{M}] matches hostname, or subdomain of it, see notes for \textsc{H} above
|
|
70
|
+ \item [{M}] matches hostname, or subdomain of it, see notes for {H} above
|
71
|
71
|
\end{description}
|
72
|
72
|
|
73
|
73
|
\subsection{Hints}
|
...
|
...
|
@@ -80,57 +81,166 @@ M:RealHostname:DisplayedHostname[:FuncLevelSpec]
|
80
|
80
|
\item see section \vref{sub:Extraction-of-realURL,} for more details on \textsc{realURL/displayedURL}
|
81
|
81
|
\end{itemize}
|
82
|
82
|
|
83
|
|
-%TODO: give up-to-date examples
|
84
|
83
|
|
85
|
|
-\subsubsection{Example}
|
|
84
|
+%TODO: move these to proper chapter
|
|
85
|
+\subsection{Examples of PDB signatures}
|
|
86
|
+To check for phishing mails that target amazon.com, or subdomains of amazon.com:
|
|
87
|
+\begin{verbatim}
|
|
88
|
+H:amazon.com
|
|
89
|
+\end{verbatim}
|
86
|
90
|
|
87
|
|
-The following line:
|
|
91
|
+To do the same, but for amazon.co.uk:
|
|
92
|
+\begin{verbatim}
|
|
93
|
+H:amazon.co.uk
|
|
94
|
+\end{verbatim}
|
88
|
95
|
|
89
|
|
-\emph{R http://www\textbackslash{}.google\textbackslash{}.(com|ro|it)
|
90
|
|
-www\textbackslash{}.google\textbackslash{}.com}
|
|
96
|
+To limit the signatures to certain engine versions:
|
|
97
|
+\begin{verbatim}
|
|
98
|
+H:amazon.co.uk:20-30
|
|
99
|
+H:amazon.co.uk:20-
|
|
100
|
+H:amazon.co.uk:0-20
|
|
101
|
+\end{verbatim}
|
|
102
|
+First line: engine versions 20, 21, ..., 29 can load it
|
91
|
103
|
|
92
|
|
-Means: \emph{\textsc{R}}\textsc{ }- this is a regex.
|
|
104
|
+Second line: engine versions >= 20 can load it
|
93
|
105
|
|
94
|
|
-Example of url pairs matching: http://www.google.com www.google.com,
|
95
|
|
-http://www.google.it www.google.com.
|
|
106
|
+Third line: engine versions < 20 can load it
|
96
|
107
|
|
97
|
|
-Example of url pairs not matching: http://www.google.c0m www.google.com
|
|
108
|
+In a real situation, you'd probably use the second form. A situation like that would be if you are using a feature of the signatures
|
|
109
|
+not available in earlier versions, or if earlier versions have bugs with your signature. Neither is the case here; the above examples
|
|
110
|
+are for illustrative purposes only.
|
98
|
111
|
|
|
112
|
+\subsection{Examples of WDB signatures}
|
|
113
|
+To allow amazon's country-specific domains and amazon.com to be mixed between \textsc{DisplayedURL} and \textsc{RealURL}:
|
|
114
|
+\begin{verbatim}
|
|
115
|
+X:.+\.amazon\.(at|ca|co\.uk|co\.jp|de|fr)([/?].*)?:.+\.amazon\.com([/?].*)?:17-
|
|
116
|
+\end{verbatim}
|
|
117
|
+Explanation of this signature:
|
|
118
|
+\begin{description}
|
|
119
|
+ \item [{X:}] this is a regular expression
|
|
120
|
+ \item [{:17-}] load signature only for engines with functionality level >= 17 (recommended for type X)
|
|
121
|
+\end{description}
|
99
|
122
|
|
100
|
|
-\subsection{How matching works}
|
|
123
|
+The regular expression is the following (X:, :17- stripped, and a / appended):
|
|
124
|
+\begin{verbatim}
|
|
125
|
+.+\.amazon\.(at|ca|co\.uk|co\.jp|de|fr)([/?].*)?:.+\.amazon\.com([/?].*)?/
|
|
126
|
+\end{verbatim}
|
101
|
127
|
|
|
128
|
+Explanation of this regular expression (note that it is a single regular expression, and not 2 regular
|
|
129
|
+expressions split at the {:}).
|
|
130
|
+\begin{itemize}
|
|
131
|
+ \item \verb;.+; any subdomain of
|
|
132
|
+ \item \verb;\.amazon\.; domain we are whitelisting (\textsc{RealURL} part)
|
|
133
|
+ \item \verb;(at|ca|co\.uk|co\.jp|de|fr); country-domains: at, ca, co.uk, co.jp, de, fr
|
|
134
|
+ \item \verb;([/?].*)?; recommended way to end real url part of whitelist, this protects against embedded URLs (evilurl.example.com/amazon.co.uk/)
|
|
135
|
+ \item \verb;:; \textsc{RealURL} and \textsc{DisplayedURL} are concatenated via a {:}, so match a literal {:} here
|
|
136
|
+ \item \verb;.+; any subdomain of
|
|
137
|
+ \item \verb;\.amazon\.com; whitelisted \textsc{DisplayedURL}
|
|
138
|
+ \item \verb;([/?].*)?; recommended way to end displayed url part, to protect against embedded URLs
|
|
139
|
+ \item \verb;/; automatically added to further protect against embedded URLs
|
|
140
|
+\end{itemize}
|
102
|
141
|
|
103
|
|
-\subsubsection{RealURL, displayedURL concatenation\label{sub:RealURL,-displayedURL-concatenation}}
|
|
142
|
+When you whitelist an entry make sure you check that both domains are owned by the same entity.
|
|
143
|
+What this whitelist entry allows is:
|
|
144
|
+Links claiming to point to amazon.com (\textsc{DisplayedURL}), but really going to a country-specific domain of amazon (\textsc{RealURL}).
|
104
|
145
|
|
105
|
|
-The phishing detection module processes pairs of realURL/displayedURL,
|
106
|
|
-and the matching against daily.wdb/daily.pdb is done as follows: the
|
107
|
|
-realURL is concatenated with a space, and with the displayedURL, then
|
108
|
|
-that \emph{line} is matched against the lines in daily.wdb/daily.pdb
|
109
|
146
|
|
110
|
|
-So if you have a line like
|
|
147
|
+\subsection{Example for how the URL extractor works}
|
|
148
|
+Consider the following HTML file:
|
|
149
|
+\begin{verbatim}
|
|
150
|
+<html>
|
|
151
|
+<a href="http://1.realurl.example.com/">
|
|
152
|
+ 1.displayedurl.example.com
|
|
153
|
+</a>
|
|
154
|
+<a href="http://2.realurl.example.com">
|
|
155
|
+ 2 d<b>i<p>splayedurl.e</b>xa<i>mple.com
|
|
156
|
+</a>
|
|
157
|
+<a href="http://3.realurl.example.com">
|
|
158
|
+ 3.nested.example.com
|
|
159
|
+ <a href="http://4.realurl.example.com">
|
|
160
|
+ 4.displayedurl.example.com
|
|
161
|
+ </a>
|
|
162
|
+</a>
|
|
163
|
+<form action="http://5.realurl.example.com">
|
|
164
|
+ sometext
|
|
165
|
+ <img src="http://5.displayedurl.example.com/img0.gif"/>
|
|
166
|
+ <a href="http://5.form.nested.displayedurl.example.com">
|
|
167
|
+ 5.form.nested.link-displayedurl.example.com
|
|
168
|
+ </a>
|
|
169
|
+</form>
|
|
170
|
+<a href="http://6.realurl.example.com">
|
|
171
|
+ 6.displ
|
|
172
|
+ <img src="6.displayedurl.example.com/img1.gif"/>
|
|
173
|
+ ayedurl.example.com
|
|
174
|
+</a>
|
|
175
|
+<a href="http://7.realurl.example.com">
|
|
176
|
+ <iframe src="http://7.displayedurl.example.com">
|
|
177
|
+</a>
|
|
178
|
+\end{verbatim}
|
|
179
|
+
|
|
180
|
+The phishing engine extracts the following \textsc{RealURL/DisplayedURL} pairs from it:
|
|
181
|
+\begin{verbatim}
|
|
182
|
+http://1.realurl.example.com/
|
|
183
|
+1.displayedurl.example.com
|
|
184
|
+
|
|
185
|
+http://2.realurl.example.com
|
|
186
|
+2displayedurl.example.com
|
|
187
|
+
|
|
188
|
+http://3.realurl.example.com
|
|
189
|
+3.nested.example.com
|
|
190
|
+
|
|
191
|
+http://4.realurl.example.com
|
|
192
|
+4.displayedurl.example.com
|
|
193
|
+
|
|
194
|
+http://5.realurl.example.com
|
|
195
|
+http://5.displayedurl.example.com/img0.gif
|
|
196
|
+
|
|
197
|
+http://5.realurl.example.com
|
|
198
|
+http://5.form.nested.displayedurl.example.com
|
|
199
|
+
|
|
200
|
+http://5.form.nested.displayedurl.example.com
|
|
201
|
+5.form.nested.link-displayedurl.example.com
|
|
202
|
+
|
|
203
|
+http://6.realurl.example.com
|
|
204
|
+6.displayedurl.example.com
|
|
205
|
+
|
|
206
|
+http://6.realurl.example.com
|
|
207
|
+6.displayedurl.example.com/img1.gif
|
|
208
|
+\end{verbatim}
|
111
|
209
|
|
112
|
|
-\textit{~www.google.ro~www.google.com}
|
113
|
210
|
|
114
|
|
-and a href like: \emph{<a href=''http://www.google.ro''>www.google.com</a>,}
|
115
|
|
-then it will match, but: \emph{<a href=''http://images.google.com''>www.google.com</a>}
|
116
|
|
-will not match.
|
|
211
|
+\subsection{How matching works}
|
|
212
|
+
|
|
213
|
+\subsubsection{RealURL, displayedURL concatenation\label{sub:RealURL,-displayedURL-concatenation}}
|
117
|
214
|
|
118
|
|
-If you use the \textbf{\textsc{H}} flag, then the 2nd href will match
|
119
|
|
-too.
|
|
215
|
+The phishing detection module processes pairs of \textsc{RealURL/DisplayedURL}.
|
|
216
|
+Matching against daily.wdb is done as follows: the \textsc{RealURL} is concatenated with a \verb+:+, and with the \textsc{DisplayedURL}, then that \emph{line} is matched against the lines in daily.wdb/daily.pdb.
|
|
217
|
+
|
|
218
|
+So if you have this line in daily.wdb:
|
|
219
|
+\begin{verbatim}
|
|
220
|
+M:www.google.ro:www.google.com
|
|
221
|
+\end{verbatim}
|
120
|
222
|
|
|
223
|
+and this href: \verb+<a href='http://www.google.ro'>www.google.com</a>+
|
|
224
|
+then it will be whitelisted, but: \verb+<a href='http://images.google.com'>www.google.com</a>+
|
|
225
|
+will not.
|
121
|
226
|
|
|
227
|
+%TODO: review & update these chapters
|
122
|
228
|
\subsubsection{What happens when a match is found}
|
123
|
229
|
|
124
|
|
-In the case of the whitelist, a match means that the realURL/displayedURL
|
|
230
|
+In the case of the whitelist, a match means that the \textsc{RealURL/DisplayedURL}
|
125
|
231
|
combination is considered \textsc{clean}, and no further checks are
|
126
|
232
|
performed on it.
|
127
|
233
|
|
128
|
|
-In the case of the domainlist, a match means that the realURL/displayedURL
|
129
|
|
-is going to be checked for phishing attempts. This is only done if
|
130
|
|
-you don't run clamav with the \emph{alldomains} option (since then
|
131
|
|
-all urls are checked). Furthermore you can restrict what checks are
|
132
|
|
-to be performed by specifying the 3-digit hexnumber.
|
|
234
|
+In the case of the domainlist, a match means that the \textsc{RealURL/DisplayedURL}
|
|
235
|
+is going to be checked for phishing attempts.
|
133
|
236
|
|
|
237
|
+%TODO: this is gone in SVN, but still present in 0.92, drop from documentation?
|
|
238
|
+This is only done if you don't run clamav with the \emph{alldomains} option (since then
|
|
239
|
+all urls are checked).
|
|
240
|
+%---
|
|
241
|
+Furthermore you can restrict what checks are to be performed by specifying the 3-digit hexnumber.
|
|
242
|
+%TODO: add section reference here
|
134
|
243
|
|
135
|
244
|
\subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}}
|
136
|
245
|
|
...
|
...
|
@@ -159,7 +269,7 @@ if nested withing a \emph{form} tag, then the action attribute of
|
159
|
159
|
the \emph{form} tag is the \textsc{realURL}
|
160
|
160
|
|
161
|
161
|
\item [{iframe}] if nested within an \emph{<a>} tag the \emph{src} attribute
|
162
|
|
-is the displayedURL, and the \emph{href} of its parent \emph{a} tag
|
|
162
|
+is the \textsc{displayedURL}, and the \emph{href} of its parent \emph{a} tag
|
163
|
163
|
is the \textsc{realURL}
|
164
|
164
|
|
165
|
165
|
|
...
|
...
|
@@ -237,7 +347,7 @@ Currently the clamav regex matcher supports:
|
237
|
237
|
|
238
|
238
|
\begin{itemize}
|
239
|
239
|
\item . (dot) character
|
240
|
|
-\item \textbackslash{} (escaping special characters)
|
|
240
|
+\item $\backslash$ (escaping special characters)
|
241
|
241
|
\item | (pipe) alternatives
|
242
|
242
|
\item {[}] (character classes)
|
243
|
243
|
\item () (parentheses for grouping, but no group extraction is performed)
|
...
|
...
|
@@ -260,7 +370,7 @@ from the first unsupported token, everything before that is still
|
260
|
260
|
processed by the internal matcher). An example might make this more
|
261
|
261
|
clear:
|
262
|
262
|
|
263
|
|
-\emph{www\textbackslash{}.google\textbackslash{}.(com|ro|it) ({[}a-zA-Z])+\textbackslash{}.google\textbackslash{}.(com|ro|it)}
|
|
263
|
+\emph{www$\backslash$.google$\backslash$.(com|ro|it) ({[}a-zA-Z])+$\backslash$.google$\backslash$.(com|ro|it)}
|
264
|
264
|
|
265
|
265
|
Everything till \emph{({[}a-zA-Z])+} is processed internally, that
|
266
|
266
|
parenthesis (and everything beyond) is processed by the posix core.
|
...
|
...
|
@@ -300,7 +410,9 @@ These constants are defined in libclamav/phishcheck.h, you can check
|
300
|
300
|
there for the latest flags.
|
301
|
301
|
|
302
|
302
|
There is a default set of flags that are enabled, these are currently:
|
303
|
|
-(CLEANUP\_URL|DOMAIN\_SUFFICIENT|CHECK\_SSL|CHECK\_CLOAKING|DOMAINLIST\_REQUIRED|CHECK\_IMG\_URL),
|
|
303
|
+\begin{verbatim}
|
|
304
|
+(CLEANUP_URL|CHECK_SSL|CHECK_CLOAKING|CHECK_IMG_URL)
|
|
305
|
+\end{verbatim}
|
304
|
306
|
ssl checking is performed only for a tags currently.
|
305
|
307
|
|
306
|
308
|
You must decide for each line in the domainlist if you want to filter
|
...
|
...
|
@@ -331,7 +443,7 @@ Recomended reading:
|
331
|
331
|
\begin{description}
|
332
|
332
|
\item [{{[}}] the opening square bracket - it marks the beginning of a
|
333
|
333
|
character class, see section\vref{sub:Character-classes}
|
334
|
|
-\item [{\textbackslash{}}] the backslash - escapes special characters,
|
|
334
|
+\item [{$\backslash$}] the backslash - escapes special characters,
|
335
|
335
|
see section \vref{sub:Escaping}
|
336
|
336
|
\item [{\^{ }}] the caret - matches the beginning of a line (not needed
|
337
|
337
|
in clamav regexes, this is implied)
|
...
|
...
|
@@ -360,9 +472,9 @@ Escaping has two purposes:
|
360
|
360
|
|
361
|
361
|
\begin{itemize}
|
362
|
362
|
\item it allows you to actually match the special characters themselves,
|
363
|
|
-for example to match the literal \emph{+}, you would write \emph{\textbackslash{}+}
|
|
363
|
+for example to match the literal \emph{+}, you would write \emph{$\backslash$+}
|
364
|
364
|
\item it also allows you to match non-printable characters, such as the
|
365
|
|
-tab (\emph{\textbackslash{}t}), newline (\emph{\textbackslash{}n}),
|
|
365
|
+tab (\emph{$\backslash$t}), newline (\emph{$\backslash$n}),
|
366
|
366
|
..
|
367
|
367
|
\end{itemize}
|
368
|
368
|
However since non-printable characters are not valid inside an url,
|
...
|
...
|
@@ -401,14 +513,14 @@ not, you have 2 choices:
|
401
|
401
|
Let's assume you are having problems because of links like this in
|
402
|
402
|
a mail:
|
403
|
403
|
|
404
|
|
-\begin{quote}
|
|
404
|
+\begin{verbatim}
|
405
|
405
|
<a href="http://69.0.241.57/bCentral/L.asp?L=XXXXXXXX">http://www.bcentral.it/</a>
|
406
|
|
-\end{quote}
|
|
406
|
+\end{verbatim}
|
407
|
407
|
After investigating those sites further, you decide they are no threat,
|
408
|
408
|
and create a line like this in daily.wdb:
|
409
|
409
|
|
410
|
410
|
\begin{quote}
|
411
|
|
-R http://www\textbackslash{}.bcentral\textbackslash{}.it/.+ http://69\textbackslash{}.0\textbackslash{}.241\textbackslash{}.57/bCentral/L\textbackslash{}.asp?L=.+
|
|
411
|
+R http://www$\backslash$.bcentral$\backslash$.it/.+ http://69$\backslash$.0$\backslash$.241$\backslash$.57/bCentral/L$\backslash$.asp?L=.+
|
412
|
412
|
\end{quote}
|
413
|
413
|
Note: urls like the above can be used to track unique mail recipients,
|
414
|
414
|
and thus know if somebody actually reads mails (so they can send more
|
...
|
...
|
@@ -429,7 +541,7 @@ Lets assume that you've recently seen many phishing attempts claiming
|
429
|
429
|
they come from Paypal. Thus you need to add paypal to daily.pdb:
|
430
|
430
|
|
431
|
431
|
\begin{quote}
|
432
|
|
-R .+ .+\textbackslash{}.paypal\textbackslash{}.com
|
|
432
|
+R .+ .+$\backslash$.paypal$\backslash$.com
|
433
|
433
|
\end{quote}
|
434
|
434
|
The above line will block (detect as phishing) mails that contain
|
435
|
435
|
urls that claim to lead to paypal, but they don't in fact.
|
...
|
...
|
@@ -488,4 +600,4 @@ whitelist, see if all urls are detected, etc.
|
488
|
488
|
\section{Examples}
|
489
|
489
|
|
490
|
490
|
|
491
|
|
-\end{document}
|
|
491
|
+\end{document}
|
492
|
492
|
\ No newline at end of file
|