From 0abf1e8d89aecd32dbdabda5da4d52a2d57a7cff Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 6 Feb 2018 14:50:48 -0700
Subject: [PATCH] [perl #132063]: Heap buffer overflow

The proximal cause is several instances in regexec.c of the code
assuming that the input was valid UTF-8, whereas the input was too short
for what the start byte claimed it would be.

I grepped through the core for any other similar uses, and did not find
any.

(cherry picked from commit fe7d8ba0a1bf567af8fa8fea128e2b9f4c553e84)
---
 regexec.c              | 29 ++++++++++++++++-------------
 t/lib/warnings/regexec |  7 +++++++
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/regexec.c b/regexec.c
index 5735b997fd..ea432c39d3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1466,7 +1466,9 @@ Perl_re_intuit_start(pTHX_
                                            ? trie_utf8_fold                         \
                                            :   trie_latin_utf8_fold)))
 
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
+/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
+ * 'foldbuf+sizeof(foldbuf)' */
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
 STMT_START {                                                                        \
     STRLEN skiplen;                                                                 \
     U8 flags = FOLD_FLAGS_FULL;                                                     \
@@ -1474,7 +1476,7 @@ STMT_START {
     case trie_flu8:                                                                 \
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
-            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end - uc);                \
         }                                                                           \
         goto do_trie_utf8_fold;                                                     \
     case trie_utf8_exactfa_fold:                                                    \
@@ -1483,7 +1485,7 @@ STMT_START {
     case trie_utf8_fold:                                                            \
       do_trie_utf8_fold:                                                            \
         if ( foldlen>0 ) {                                                          \
-            uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
             foldlen -= len;                                                         \
             uscan += len;                                                           \
             len=0;                                                                  \
@@ -1500,7 +1502,7 @@ STMT_START {
         /* FALLTHROUGH */                                                           \
     case trie_latin_utf8_fold:                                                      \
         if ( foldlen>0 ) {                                                          \
-            uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
             foldlen -= len;                                                         \
             uscan += len;                                                           \
             len=0;                                                                  \
@@ -1519,7 +1521,7 @@ STMT_START {
         }                                                                           \
         /* FALLTHROUGH */                                                           \
     case trie_utf8:                                                                 \
-        uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
+        uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
         break;                                                                      \
     case trie_plain:                                                                \
         uvc = (UV)*uc;                                                              \
@@ -2599,10 +2601,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     }
                     points[pointpos++ % maxlen]= uc;
                     if (foldlen || uc < (U8*)strend) {
-                        REXEC_TRIE_READ_CHAR(trie_type, trie,
-                                         widecharmap, uc,
-                                         uscan, len, uvc, charid, foldlen,
-                                         foldbuf, uniflags);
+                        REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+                                             (U8 *) strend, uscan, len, uvc,
+                                             charid, foldlen, foldbuf,
+                                             uniflags);
                         DEBUG_TRIE_EXECUTE_r({
                             dump_exec_pos( (char *)uc, c, strend,
                                         real_start, s, utf8_target, 0);
@@ -5511,8 +5513,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
 			I32 offset;
 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
-					     uscan, len, uvc, charid, foldlen,
-					     foldbuf, uniflags);
+                                             (U8 *) reginfo->strend, uscan,
+                                             len, uvc, charid, foldlen,
+                                             foldbuf, uniflags);
 			charcount++;
 			if (foldlen>0)
 			    ST.longfold = TRUE;
@@ -5642,8 +5645,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 			while (foldlen) {
 			    if (!--chars)
 				break;
-			    uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
-					    uniflags);
+			    uvc = utf8n_to_uvchr(uscan, foldlen, &len,
+                                                 uniflags);
 			    uscan += len;
 			    foldlen -= len;
 			}
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 900dd6ee7f..6635142dea 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -260,3 +260,10 @@ setlocale(&POSIX::LC_CTYPE, $utf8_locale);
 "k" =~ /(?[ \N{KELVIN SIGN} ])/i;
 ":" =~ /(?[ \: ])/;
 EXPECT
+########
+# NAME perl #132063, read beyond buffer end
+# OPTION fatal
+"\xff" =~ /(?il)\x{100}|\x{100}/;
+EXPECT
+Malformed UTF-8 character: \xff (too short; 1 byte available, need 13) in pattern match (m//) at - line 2.
+Malformed UTF-8 character (fatal) at - line 2.
-- 
2.15.1-424-g9478a660812