[grep] Fixed invalid UTF-8 byte sequence error in PCRE mode

Tue Nov 11 15:31:40 UTC 2014

commit 33ee7fe86e81e9c31db80c28fac4be924f819b68
Author: Jaroslav Škarvada <jskarvad at redhat.com>
Date:   Tue Nov 11 16:31:32 2014 +0100

    Fixed invalid UTF-8 byte sequence error in PCRE mode
    
      (by pcre-invalid-utf8-fix patch)
      Resolves: rhbz#1161832

 grep-2.20-pcre-invalid-utf8-fix.patch |  136 +++++++++++++++++++++++++++++++++
 grep.spec                             |   10 ++-
 2 files changed, 145 insertions(+), 1 deletions(-)
---

diff --git a/grep-2.20-pcre-invalid-utf8-fix.patch b/grep-2.20-pcre-invalid-utf8-fix.patch
new file mode 100644
index 0000000..5f7530f
--- /dev/null
+++ b/grep-2.20-pcre-invalid-utf8-fix.patch
@@ -0,0 +1,136 @@
+diff --git a/src/pcresearch.c b/src/pcresearch.c
+index 820dd00..11df488 100644
+--- a/src/pcresearch.c
++++ b/src/pcresearch.c
+@@ -136,34 +136,42 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+ #else
+   /* This array must have at least two elements; everything after that
+      is just for performance improvement in pcre_exec.  */
+-  int sub[300];
++  enum { nsub = 300 };
++  int sub[nsub];
+ 
+-  const char *line_buf, *line_end, *line_next;
++  char const *p = start_ptr ? start_ptr : buf;
++  int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
++  char const *line_start = buf;
+   int e = PCRE_ERROR_NOMATCH;
+-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
++  char const *line_end;
+ 
+   /* PCRE can't limit the matching to single lines, therefore we have to
+      match each line in the buffer separately.  */
+-  for (line_next = buf;
+-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
+-       start_ofs -= line_next - line_buf)
++  for (; p < buf + size; p = line_start = line_end + 1)
+     {
+-      line_buf = line_next;
+-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
+-      if (line_end == NULL)
+-        line_next = line_end = buf + size;
+-      else
+-        line_next = line_end + 1;
+-
+-      if (start_ptr && start_ptr >= line_end)
+-        continue;
++      line_end = memchr (p, eolbyte, buf + size - p);
+ 
+-      if (INT_MAX < line_end - line_buf)
++      if (INT_MAX < line_end - p)
+         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
+ 
+-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
+-                     start_ofs < 0 ? 0 : start_ofs, 0,
+-                     sub, sizeof sub / sizeof *sub);
++      /* Treat encoding-error bytes as data that cannot match.  */
++      for (;;)
++        {
++          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
++          if (e != PCRE_ERROR_BADUTF8)
++            break;
++          e = pcre_exec (cre, extra, p, sub[0], 0,
++                         options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
++                         sub, nsub);
++          if (e != PCRE_ERROR_NOMATCH)
++            break;
++          p += sub[0] + 1;
++          options = PCRE_NOTBOL;
++        }
++
++      if (e != PCRE_ERROR_NOMATCH)
++        break;
++      options = 0;
+     }
+ 
+   if (e <= 0)
+@@ -180,10 +188,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+           error (EXIT_TROUBLE, 0,
+                  _("exceeded PCRE's backtracking limit"));
+ 
+-        case PCRE_ERROR_BADUTF8:
+-          error (EXIT_TROUBLE, 0,
+-                 _("invalid UTF-8 byte sequence in input"));
+-
+         default:
+           /* For now, we lump all remaining PCRE failures into this basket.
+              If anyone cares to provide sample grep usage that can trigger
+@@ -197,25 +201,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+     }
+   else
+     {
+-      /* Narrow down to the line we've found.  */
+-      char const *beg = line_buf + sub[0];
+-      char const *end = line_buf + sub[1];
+-      char const *buflim = buf + size;
+-      char eol = eolbyte;
+-      if (!start_ptr)
+-        {
+-          /* FIXME: The case when '\n' is not found indicates a bug:
+-             Since grep is line oriented, the match should never contain
+-             a newline, so there _must_ be a newline following.
+-           */
+-          if (!(end = memchr (end, eol, buflim - end)))
+-            end = buflim;
+-          else
+-            end++;
+-          while (buf < beg && beg[-1] != eol)
+-            --beg;
+-        }
+-
++      char const *beg = start_ptr ? p + sub[0] : line_start;
++      char const *end = start_ptr ? p + sub[1] : line_end + 1;
+       *match_size = end - beg;
+       return beg - buf;
+     }
+diff --git a/tests/pcre-infloop b/tests/pcre-infloop
+index 1b33e72..b92f8e1 100755
+--- a/tests/pcre-infloop
++++ b/tests/pcre-infloop
+@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
+ fail=0
+ 
+ LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
+-test $? = 2 || fail_ "libpcre's match function appears to infloop"
++test $? = 1 || fail_ "libpcre's match function appears to infloop"
+ 
+ Exit $fail
+diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
+index 913e8ee..9da4b18 100755
+--- a/tests/pcre-invalid-utf8-input
++++ b/tests/pcre-invalid-utf8-input
+@@ -13,9 +13,12 @@ require_en_utf8_locale_
+ 
+ fail=0
+ 
+-printf 'j\202\nj\n' > in || framework_failure_
++printf 'j\202j\nj\nk\202\n' > in || framework_failure_
+ 
+ LC_ALL=en_US.UTF-8 grep -P j in
+-test $? -eq 2 || fail=1
++test $? -eq 0 || fail=1
++
++LC_ALL=en_US.UTF-8 grep -P 'k$' in
++test $? -eq 1 || fail=1
+ 
+ Exit $fail
diff --git a/grep.spec b/grep.spec
index 7458394..1784194 100644
--- a/grep.spec
+++ b/grep.spec
@@ -3,7 +3,7 @@
 Summary: Pattern matching utilities
 Name: grep
 Version: 2.20
-Release: 5%{?dist}
+Release: 6%{?dist}
 License: GPLv3+
 Group: Applications/Text
 Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
@@ -15,6 +15,8 @@ Source4: grepconf.sh
 Patch0: grep-2.20-man-fix-gs.patch
 # upstream ticket 39445
 Patch1: grep-2.20-help-align.patch
+# backported from upstream
+Patch2: grep-2.20-pcre-invalid-utf8-fix.patch
 URL: http://www.gnu.org/software/grep/
 Requires(post): /sbin/install-info
 Requires(preun): /sbin/install-info
@@ -35,6 +37,7 @@ GNU grep is needed by many scripts, so it shall be installed on every system.
 %setup -q
 %patch0 -p1 -b .man-fix-gs
 %patch1 -p1 -b .help-align
+%patch2 -p1 -b .pcre-invalid-utf8-fix
 
 %build
 %global BUILD_FLAGS $RPM_OPT_FLAGS
@@ -90,6 +93,11 @@ fi
 %{_libexecdir}/grepconf.sh
 
 %changelog
+* Tue Nov 11 2014 Jaroslav Škarvada <jskarvad at redhat.com> - 2.20-6
+- Fixed invalid UTF-8 byte sequence error in PCRE mode
+  (by pcre-invalid-utf8-fix patch)
+  Resolves: rhbz#1161832
+
 * Wed Aug 20 2014 Jaroslav Škarvada <jskarvad at redhat.com> - 2.20-5
 - Added script to check whether grep is coloured
   Resolves: rhbz#1034631