[pcre] Fix caseless reference
Petr Pisar
ppisar at fedoraproject.org
Mon May 9 12:00:38 UTC 2011
commit 54a4973709c4a6c5e87fdf658ed7e21198d0ee9d
Author: Petr Písař <ppisar at redhat.com>
Date: Mon May 9 13:58:00 2011 +0200
Fix caseless reference
pcre-8.12-caseless_reference.patch | 552 ++++++++++++++++++++++++++++++++++++
pcre.spec | 9 +-
2 files changed, 560 insertions(+), 1 deletions(-)
---
diff --git a/pcre-8.12-caseless_reference.patch b/pcre-8.12-caseless_reference.patch
new file mode 100644
index 0000000..1ffd957
--- /dev/null
+++ b/pcre-8.12-caseless_reference.patch
@@ -0,0 +1,552 @@
+r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines
+Fix problems with caseless reference matching in UTF-8 mode when the
+upper/lower case characters have different lengths.
+
+and
+
+r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines
+Complete incomplete fix for UTF-8 caseless references of different lengths.
+
+http://bugs.exim.org/show_bug.cgi?id=1074
+
+Petr Pisar: Changelog and comment changes removed.
+
+Index: testdata/testoutput12
+===================================================================
+--- testdata/testoutput12 (revision 594)
++++ testdata/testoutput12 (revision 595)
+@@ -1176,4 +1176,64 @@
+ End
+ ------------------------------------------------------------------
+
++/-- These behaved oddly in Perl, so they are kept in this test --/
++
++/(\x{23a}\x{23a}\x{23a})?\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
++No match
++
++/(ȺȺȺ)?\1/8i
++ ȺȺȺⱥⱥ
++No match
++
++/(\x{23a}\x{23a}\x{23a})?\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 1: \x{23a}\x{23a}\x{23a}
++
++/(ȺȺȺ)?\1/8i
++ ȺȺȺⱥⱥⱥ
++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 1: \x{23a}\x{23a}\x{23a}
++
++/(\x{23a}\x{23a}\x{23a})\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
++No match
++
++/(ȺȺȺ)\1/8i
++ ȺȺȺⱥⱥ
++No match
++
++/(\x{23a}\x{23a}\x{23a})\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 1: \x{23a}\x{23a}\x{23a}
++
++/(ȺȺȺ)\1/8i
++ ȺȺȺⱥⱥⱥ
++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++ 1: \x{23a}\x{23a}\x{23a}
++
++/(\x{2c65}\x{2c65})\1/8i
++ \x{2c65}\x{2c65}\x{23a}\x{23a}
++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
++ 1: \x{2c65}\x{2c65}
++
++/(ⱥⱥ)\1/8i
++ ⱥⱥȺȺ
++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
++ 1: \x{2c65}\x{2c65}
++
++/(\x{23a}\x{23a}\x{23a})\1Y/8i
++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y
++ 1: \x{23a}\x{23a}\x{23a}
++
++/(\x{2c65}\x{2c65})\1Y/8i
++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
++ 1: \x{2c65}\x{2c65}
++
++/-- --/
++
+ /-- End of testinput12 --/
+Index: testdata/testinput12
+===================================================================
+--- testdata/testinput12 (revision 594)
++++ testdata/testinput12 (revision 595)
+@@ -503,4 +503,44 @@
+
+ /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+
++/-- These behaved oddly in Perl, so they are kept in this test --/
++
++/(\x{23a}\x{23a}\x{23a})?\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
++
++/(ȺȺȺ)?\1/8i
++ ȺȺȺⱥⱥ
++
++/(\x{23a}\x{23a}\x{23a})?\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++
++/(ȺȺȺ)?\1/8i
++ ȺȺȺⱥⱥⱥ
++
++/(\x{23a}\x{23a}\x{23a})\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
++
++/(ȺȺȺ)\1/8i
++ ȺȺȺⱥⱥ
++
++/(\x{23a}\x{23a}\x{23a})\1/8i
++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
++
++/(ȺȺȺ)\1/8i
++ ȺȺȺⱥⱥⱥ
++
++/(\x{2c65}\x{2c65})\1/8i
++ \x{2c65}\x{2c65}\x{23a}\x{23a}
++
++/(ⱥⱥ)\1/8i
++ ⱥⱥȺȺ
++
++/(\x{23a}\x{23a}\x{23a})\1Y/8i
++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
++
++/(\x{2c65}\x{2c65})\1Y/8i
++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
++
++/-- --/
++
+ /-- End of testinput12 --/
+Index: pcre_exec.c
+===================================================================
+--- pcre_exec.c (revision 594)
++++ pcre_exec.c (revision 595)
+@@ -132,24 +132,27 @@
+ * Match a back-reference *
+ *************************************************/
+
+-/* If a back reference hasn't been set, the length that is passed is greater
+-than the number of characters left in the string, so the match fails.
++/* Normally, if a back reference hasn't been set, the length that is passed is
++negative, so the match always fails. However, in JavaScript compatibility mode,
++the length passed is zero. Note that in caseless UTF-8 mode, the number of
++subject bytes matched may be different to the number of reference bytes.
+
+ Arguments:
+ offset index into the offset vector
+- eptr points into the subject
+- length length to be matched
++ eptr pointer into the subject
++ length length of reference to be matched (number of bytes)
+ md points to match data block
+ ims the ims flags
+
+-Returns: TRUE if matched
++Returns: < 0 if not matched, otherwise the number of subject bytes matched
+ */
+
+-static BOOL
++static int
+ match_ref(int offset, register USPTR eptr, int length, match_data *md,
+ unsigned long int ims)
+ {
+-USPTR p = md->start_subject + md->offset_vector[offset];
++USPTR eptr_start = eptr;
++register USPTR p = md->start_subject + md->offset_vector[offset];
+
+ #ifdef PCRE_DEBUG
+ if (eptr >= md->end_subject)
+@@ -164,9 +167,9 @@
+ printf("\n");
+ #endif
+
+-/* Always fail if not enough characters left */
++/* Always fail if reference not set (and not JavaScript compatible). */
+
+-if (length > md->end_subject - eptr) return FALSE;
++if (length < 0) return -1;
+
+ /* Separate the caseless case for speed. In UTF-8 mode we can only do this
+ properly if Unicode properties are supported. Otherwise, we can check only
+@@ -178,13 +181,21 @@
+ #ifdef SUPPORT_UCP
+ if (md->utf8)
+ {
+- USPTR endptr = eptr + length;
+- while (eptr < endptr)
++ /* Match characters up to the end of the reference. NOTE: the number of
++ bytes matched may differ, because there are some characters whose upper and
++ lower case versions code as different numbers of bytes. For example, U+023A
++ (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
++ a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
++ the latter. It is important, therefore, to check the length along the
++ reference, not along the subject (earlier code did this wrong). */
++
++ USPTR endptr = p + length;
++ while (p < endptr)
+ {
+ int c, d;
+ GETCHARINC(c, eptr);
+ GETCHARINC(d, p);
+- if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
++ if (c != d && c != UCD_OTHERCASE(d)) return -1;
+ }
+ }
+ else
+@@ -195,16 +206,16 @@
+ is no UCP support. */
+
+ while (length-- > 0)
+- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
++ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+ }
+
+ /* In the caseful case, we can just compare the bytes, whether or not we
+ are in UTF-8 mode. */
+
+ else
+- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
++ { while (length-- > 0) if (*p++ != *eptr++) return -1; }
+
+-return TRUE;
++return eptr - eptr_start;
+ }
+
+
+@@ -2252,129 +2263,129 @@
+ loops). */
+
+ case OP_REF:
+- {
+- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+- ecode += 3;
++ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
++ ecode += 3;
+
+- /* If the reference is unset, there are two possibilities:
++ /* If the reference is unset, there are two possibilities:
+
+- (a) In the default, Perl-compatible state, set the length to be longer
+- than the amount of subject left; this ensures that every attempt at a
+- match fails. We can't just fail here, because of the possibility of
+- quantifiers with zero minima.
++ (a) In the default, Perl-compatible state, set the length negative;
++ this ensures that every attempt at a match fails. We can't just fail
++ here, because of the possibility of quantifiers with zero minima.
+
+- (b) If the JavaScript compatibility flag is set, set the length to zero
+- so that the back reference matches an empty string.
++ (b) If the JavaScript compatibility flag is set, set the length to zero
++ so that the back reference matches an empty string.
+
+- Otherwise, set the length to the length of what was matched by the
+- referenced subpattern. */
++ Otherwise, set the length to the length of what was matched by the
++ referenced subpattern. */
+
+- if (offset >= offset_top || md->offset_vector[offset] < 0)
+- length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
+- else
+- length = md->offset_vector[offset+1] - md->offset_vector[offset];
++ if (offset >= offset_top || md->offset_vector[offset] < 0)
++ length = (md->jscript_compat)? 0 : -1;
++ else
++ length = md->offset_vector[offset+1] - md->offset_vector[offset];
+
+- /* Set up for repetition, or handle the non-repeated case */
++ /* Set up for repetition, or handle the non-repeated case */
+
+- switch (*ecode)
+- {
+- case OP_CRSTAR:
+- case OP_CRMINSTAR:
+- case OP_CRPLUS:
+- case OP_CRMINPLUS:
+- case OP_CRQUERY:
+- case OP_CRMINQUERY:
+- c = *ecode++ - OP_CRSTAR;
+- minimize = (c & 1) != 0;
+- min = rep_min[c]; /* Pick up values from tables; */
+- max = rep_max[c]; /* zero for max => infinity */
+- if (max == 0) max = INT_MAX;
+- break;
++ switch (*ecode)
++ {
++ case OP_CRSTAR:
++ case OP_CRMINSTAR:
++ case OP_CRPLUS:
++ case OP_CRMINPLUS:
++ case OP_CRQUERY:
++ case OP_CRMINQUERY:
++ c = *ecode++ - OP_CRSTAR;
++ minimize = (c & 1) != 0;
++ min = rep_min[c]; /* Pick up values from tables; */
++ max = rep_max[c]; /* zero for max => infinity */
++ if (max == 0) max = INT_MAX;
++ break;
+
+- case OP_CRRANGE:
+- case OP_CRMINRANGE:
+- minimize = (*ecode == OP_CRMINRANGE);
+- min = GET2(ecode, 1);
+- max = GET2(ecode, 3);
+- if (max == 0) max = INT_MAX;
+- ecode += 5;
+- break;
++ case OP_CRRANGE:
++ case OP_CRMINRANGE:
++ minimize = (*ecode == OP_CRMINRANGE);
++ min = GET2(ecode, 1);
++ max = GET2(ecode, 3);
++ if (max == 0) max = INT_MAX;
++ ecode += 5;
++ break;
+
+- default: /* No repeat follows */
+- if (!match_ref(offset, eptr, length, md, ims))
+- {
+- CHECK_PARTIAL();
+- MRRETURN(MATCH_NOMATCH);
+- }
+- eptr += length;
+- continue; /* With the main loop */
++ default: /* No repeat follows */
++ if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
++ {
++ CHECK_PARTIAL();
++ MRRETURN(MATCH_NOMATCH);
+ }
++ eptr += length;
++ continue; /* With the main loop */
++ }
+
+- /* If the length of the reference is zero, just continue with the
+- main loop. */
++ /* Handle repeated back references. If the length of the reference is
++ zero, just continue with the main loop. */
+
+- if (length == 0) continue;
++ if (length == 0) continue;
+
+- /* First, ensure the minimum number of matches are present. We get back
+- the length of the reference string explicitly rather than passing the
+- address of eptr, so that eptr can be a register variable. */
++ /* First, ensure the minimum number of matches are present. We get back
++ the length of the reference string explicitly rather than passing the
++ address of eptr, so that eptr can be a register variable. */
+
+- for (i = 1; i <= min; i++)
++ for (i = 1; i <= min; i++)
++ {
++ int slength;
++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+ {
+- if (!match_ref(offset, eptr, length, md, ims))
+- {
+- CHECK_PARTIAL();
+- MRRETURN(MATCH_NOMATCH);
+- }
+- eptr += length;
++ CHECK_PARTIAL();
++ MRRETURN(MATCH_NOMATCH);
+ }
++ eptr += slength;
++ }
+
+- /* If min = max, continue at the same level without recursion.
+- They are not both allowed to be zero. */
++ /* If min = max, continue at the same level without recursion.
++ They are not both allowed to be zero. */
+
+- if (min == max) continue;
++ if (min == max) continue;
+
+- /* If minimizing, keep trying and advancing the pointer */
++ /* If minimizing, keep trying and advancing the pointer */
+
+- if (minimize)
++ if (minimize)
++ {
++ for (fi = min;; fi++)
+ {
+- for (fi = min;; fi++)
++ int slength;
++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
++ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
++ if (fi >= max) MRRETURN(MATCH_NOMATCH);
++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+ {
+- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+- if (fi >= max) MRRETURN(MATCH_NOMATCH);
+- if (!match_ref(offset, eptr, length, md, ims))
+- {
+- CHECK_PARTIAL();
+- MRRETURN(MATCH_NOMATCH);
+- }
+- eptr += length;
++ CHECK_PARTIAL();
++ MRRETURN(MATCH_NOMATCH);
+ }
+- /* Control never gets here */
++ eptr += slength;
+ }
++ /* Control never gets here */
++ }
+
+- /* If maximizing, find the longest string and work backwards */
++ /* If maximizing, find the longest string and work backwards */
+
+- else
++ else
++ {
++ pp = eptr;
++ for (i = min; i < max; i++)
+ {
+- pp = eptr;
+- for (i = min; i < max; i++)
++ int slength;
++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+ {
+- if (!match_ref(offset, eptr, length, md, ims))
+- {
+- CHECK_PARTIAL();
+- break;
+- }
+- eptr += length;
++ CHECK_PARTIAL();
++ break;
+ }
+- while (eptr >= pp)
+- {
+- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+- eptr -= length;
+- }
+- MRRETURN(MATCH_NOMATCH);
++ eptr += slength;
+ }
++ while (eptr >= pp)
++ {
++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
++ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
++ eptr -= length;
++ }
++ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+Index: testdata/testinput1
+===================================================================
+--- testdata/testinput1 (revision 596)
++++ testdata/testinput1 (revision 597)
+@@ -4079,4 +4079,10 @@
+ /^\c/
+ ?
+
++/(abc)\1/i
++ abc
++
++/(abc)\1/
++ abc
++
+ /-- End of testinput1 --/
+Index: testdata/testoutput1
+===================================================================
+--- testdata/testoutput1 (revision 596)
++++ testdata/testoutput1 (revision 597)
+@@ -6666,4 +6666,12 @@
+ ?
+ 0: ?
+
++/(abc)\1/i
++ abc
++No match
++
++/(abc)\1/
++ abc
++No match
++
+ /-- End of testinput1 --/
+Index: testdata/testinput4
+===================================================================
+--- testdata/testinput4 (revision 596)
++++ testdata/testinput4 (revision 597)
+@@ -644,4 +644,10 @@
+ /A*/g8
+ AAB\x{123}BAA
+
++/(abc)\1/8i
++ abc
++
++/(abc)\1/8
++ abc
++
+ /-- End of testinput4 --/
+Index: testdata/testoutput4
+===================================================================
+--- testdata/testoutput4 (revision 596)
++++ testdata/testoutput4 (revision 597)
+@@ -1128,4 +1128,12 @@
+ 0: AA
+ 0:
+
++/(abc)\1/8i
++ abc
++No match
++
++/(abc)\1/8
++ abc
++No match
++
+ /-- End of testinput4 --/
+Index: pcre_exec.c
+===================================================================
+--- pcre_exec.c (revision 596)
++++ pcre_exec.c (revision 597)
+@@ -193,6 +193,7 @@
+ while (p < endptr)
+ {
+ int c, d;
++ if (eptr >= md->end_subject) return -1;
+ GETCHARINC(c, eptr);
+ GETCHARINC(d, p);
+ if (c != d && c != UCD_OTHERCASE(d)) return -1;
+@@ -204,16 +205,21 @@
+
+ /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
+ is no UCP support. */
+-
+- while (length-- > 0)
+- { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
++ {
++ if (eptr + length > md->end_subject) return -1;
++ while (length-- > 0)
++ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
++ }
+ }
+
+ /* In the caseful case, we can just compare the bytes, whether or not we
+ are in UTF-8 mode. */
+
+ else
+- { while (length-- > 0) if (*p++ != *eptr++) return -1; }
++ {
++ if (eptr + length > md->end_subject) return -1;
++ while (length-- > 0) if (*p++ != *eptr++) return -1;
++ }
+
+ return eptr - eptr_start;
+ }
diff --git a/pcre.spec b/pcre.spec
index 57b1344..5e6b42e 100644
--- a/pcre.spec
+++ b/pcre.spec
@@ -1,6 +1,6 @@
Name: pcre
Version: 8.12
-Release: 3%{?dist}
+Release: 4%{?dist}
Summary: Perl-compatible regular expression library
Group: System Environment/Libraries
License: BSD
@@ -11,6 +11,8 @@ Patch0: pcre-8.10-multilib.patch
Patch1: pcre-8.12-manual_typos.patch
# Refused by upstream, bug #675477
Patch2: pcre-8.12-refused_spelling_terminated.patch
+# In upstream, bug #702623
+Patch3: pcre-8.12-caseless_reference.patch
# New libtool to get rid of rpath
BuildRequires: autoconf, automake, libtool
@@ -44,6 +46,7 @@ Library for static linking for %{name}.
libtoolize --copy --force && autoreconf
%patch1 -p0 -b .manual_typos
%patch2 -p1 -b .terminated_typos
+%patch3 -p0 -b .caseless_reference
# One contributor's name is non-UTF-8
for F in ChangeLog; do
iconv -f latin1 -t utf8 "$F" >"${F}.utf8"
@@ -103,6 +106,10 @@ make check
%doc COPYING LICENCE
%changelog
+* Mon May 09 2011 Petr Pisar <ppisar at redhat.com> - 8.12-4
+- Fix caseless reference matching in UTF-8 mode when the upper/lower case
+ characters have different lengths (bug #702623)
+
* Mon May 09 2011 Petr Pisar <ppisar at redhat.com> - 8.12-3
- Fix typos in manual pages (bugs #675476, #675477)
- Clean spec file up
More information about the scm-commits mailing list