[coreutils] various fixes in tr case conversion

Thu Sep 30 15:43:25 UTC 2010

commit 0a5116219c371d29b78c0ad59a8359d47c57f564
Author: Ondřej Vašík <ovasik at redhat.com>
Date:   Thu Sep 30 17:43:34 2010 +0200

    various fixes in tr case conversion

 coreutils-8.5-trcaseconversion.patch |  431 ++++++++++++++++++++++++++++++++++
 coreutils.spec                       |    9 +-
 2 files changed, 439 insertions(+), 1 deletions(-)
---

diff --git a/coreutils-8.5-trcaseconversion.patch b/coreutils-8.5-trcaseconversion.patch
new file mode 100644
index 0000000..6cd66ba
--- /dev/null
+++ b/coreutils-8.5-trcaseconversion.patch
@@ -0,0 +1,431 @@
+From: Pádraig Brady <P at draigBrady.com>
+Date: Mon, 27 Sep 2010 06:16:44 +0000 (+0100)
+Subject: tr: fix various issues with case conversion
+X-Git-Url: http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=commitdiff_plain;h=3f48829c;hp=704eedab034e24814067c535d3577f165c9a8b68
+
+tr: fix various issues with case conversion
+
+This valid translation spec aborted:
+  LC_ALL=en_US.iso-8859-1 tr '[:upper:]- ' '[:lower:]_'
+This invalid translation spec aborted:
+  LC_ALL=en_US.iso-8859-1 tr '[:upper:] '  '[:lower:]'
+This was caused by commit 6efd1046, 05-01-2008,
+"Avoid tr case-conversion failure in some locales"
+
+This misaligned conversion spec was allowed:
+  LC_ALL=C tr 'A-Y[:lower:]' 'a-z[:upper:]'
+This was caused by commit af5d0c36, 21-10-2007,
+"tr: do not reject an unmatched [:lower:] or [:upper:] in SET1"
+
+This misaligned spec was allowed by extending the class:
+  LC_ALL=C tr '[:upper:] ' '[:lower:]'
+
+* src/tr.c (validate_case_classes): A new function to check
+alignment of case conversion classes.  Also it adjusts the
+length of the sets so that locales with different numbers of
+upper and lower case characters, don't cause issues.
+(string2_extend): Disallow extending the case conversion
+class as in the above example.  That is locale dependent
+and most likely not what the user wants.
+(validate): Do the simple test for "restricted" char classes
+earlier, so we don't redundantly do more expensive validation.
+(main): Remove the case class validation, and simplify.
+* tests/misc/tr-case-class: A new test to test the various
+alignment and locale issues, associated with case conversion.
+* tests/misc/tr: Move case conversion tests to new tr-case-class.
+* tests/Makefile.am: Reference the new test.
+---
+
+diff --git a/src/tr.c b/src/tr.c
+index a5b6810..479d3d3 100644
+--- a/src/tr.c
++++ b/src/tr.c
+@@ -1177,6 +1177,78 @@ card_of_complement (struct Spec_list *s)
+   return cardinality;
+ }
+ 
++/* Discard the lengths associated with a case conversion,
++   as using the actual number of upper or lower case characters
++   is problematic when they don't match in some locales.
++   Also ensure the case conversion classes in string2 are
++   aligned correctly with those in string1.
++   Note POSIX says the behavior of `tr "[:upper:]" "[:upper:]"'
++   is undefined.  Therefore we allow it (unlike Solaris)
++   and treat it as a no-op.  */
++
++static void
++validate_case_classes (struct Spec_list *s1, struct Spec_list *s2)
++{
++  size_t n_upper = 0;
++  size_t n_lower = 0;
++  unsigned int i;
++  int c1 = 0;
++  int c2 = 0;
++  count old_s1_len = s1->length;
++  count old_s2_len = s2->length;
++  struct List_element *s1_tail = s1->tail;
++  struct List_element *s2_tail = s2->tail;
++  bool s1_new_element = true;
++  bool s2_new_element = true;
++
++  if (!s2->has_char_class)
++    return;
++
++  for (i = 0; i < N_CHARS; i++)
++    {
++      if (isupper (i))
++        n_upper++;
++      if (islower (i))
++        n_lower++;
++    }
++
++  s1->state = BEGIN_STATE;
++  s2->state = BEGIN_STATE;
++
++  while (c1 != -1 && c2 != -1)
++    {
++      enum Upper_Lower_class class_s1, class_s2;
++
++      c1 = get_next (s1, &class_s1);
++      c2 = get_next (s2, &class_s2);
++
++      /* If c2 transitions to a new case class, then
++         c1 must also transition at the same time.  */
++      if (s2_new_element && class_s2 != UL_NONE
++          && !(s1_new_element && class_s1 != UL_NONE))
++        error (EXIT_FAILURE, 0,
++               _("misaligned [:upper:] and/or [:lower:] construct"));
++
++      /* If case converting, quickly skip over the elements.  */
++      if (class_s2 != UL_NONE)
++        {
++          skip_construct (s1);
++          skip_construct (s2);
++          /* Discount insignificant/problematic lengths.  */
++          s1->length -= (class_s1 == UL_UPPER ? n_upper : n_lower) - 1;
++          s2->length -= (class_s2 == UL_UPPER ? n_upper : n_lower) - 1;
++        }
++
++      s1_new_element = s1->state == NEW_ELEMENT; /* Next element is new.  */
++      s2_new_element = s2->state == NEW_ELEMENT; /* Next element is new.  */
++    }
++
++  assert (old_s1_len >= s1->length && old_s2_len >= s2->length);
++
++  s1->tail = s1_tail;
++  s2->tail = s2_tail;
++}
++
+ /* Gather statistics about the spec-list S in preparation for the tests
+    in validate that determine the consistency of the specs.  This function
+    is called at most twice; once for string1, and again for any string2.
+@@ -1318,20 +1390,14 @@ parse_str (char const *s, struct Spec_list *spec_list)
+    Upon successful completion, S2->length is set to S1->length.  The only
+    way this function can fail to make S2 as long as S1 is when S2 has
+    zero-length, since in that case, there is no last character to repeat.
+-   So S2->length is required to be at least 1.
++   So S2->length is required to be at least 1.  */
+ 
+-   Providing this functionality allows the user to do some pretty
+-   non-BSD (and non-portable) things:  For example, the command
+-       tr -cs '[:upper:]0-9' '[:lower:]'
+-   is almost guaranteed to give results that depend on your collating
+-   sequence.  */
+ 
+ static void
+ string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
+ {
+   struct List_element *p;
+   unsigned char char_to_repeat;
+-  int i;
+ 
+   assert (translating);
+   assert (s1->length > s2->length);
+@@ -1347,11 +1413,13 @@ string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
+       char_to_repeat = p->u.range.last_char;
+       break;
+     case RE_CHAR_CLASS:
+-      for (i = N_CHARS - 1; i >= 0; i--)
+-        if (is_char_class_member (p->u.char_class, i))
+-          break;
+-      assert (i >= 0);
+-      char_to_repeat = i;
++      /* Note BSD allows extending of classes in string2.  For example:
++           tr '[:upper:]0-9' '[:lower:]'
++         That's not portable however, contradicts POSIX and is dependent
++         on your collating sequence.  */
++      error (EXIT_FAILURE, 0,
++             _("when translating with string1 longer than string2,\n\
++the latter string must not end with a character class"));
+       break;
+ 
+     case RE_REPEATED_CHAR:
+@@ -1431,6 +1499,15 @@ validate (struct Spec_list *s1, struct Spec_list *s2)
+ when translating"));
+             }
+ 
++          if (s2->has_restricted_char_class)
++            {
++              error (EXIT_FAILURE, 0,
++                     _("when translating, the only character classes that may \
++appear in\nstring2 are `upper' and `lower'"));
++            }
++
++          validate_case_classes (s1, s2);
++
+           if (s1->length > s2->length)
+             {
+               if (!truncate_set1)
+@@ -1452,13 +1529,6 @@ when translating"));
+                      _("when translating with complemented character classes,\
+ \nstring2 must map all characters in the domain to one"));
+             }
+-
+-          if (s2->has_restricted_char_class)
+-            {
+-              error (EXIT_FAILURE, 0,
+-                     _("when translating, the only character classes that may \
+-appear in\nstring2 are `upper' and `lower'"));
+-            }
+         }
+       else
+         /* Not translating.  */
+@@ -1812,7 +1882,6 @@ main (int argc, char **argv)
+         {
+           int c1, c2;
+           int i;
+-          bool case_convert = false;
+           enum Upper_Lower_class class_s1;
+           enum Upper_Lower_class class_s2;
+ 
+@@ -1822,47 +1891,21 @@ main (int argc, char **argv)
+           s2->state = BEGIN_STATE;
+           while (true)
+             {
+-              /* When the previous pair identified case-converting classes,
+-                 advance S1 and S2 so that each points to the following
+-                 construct.  */
+-              if (case_convert)
+-                {
+-                  skip_construct (s1);
+-                  skip_construct (s2);
+-                  case_convert = false;
+-                }
+-
+               c1 = get_next (s1, &class_s1);
+               c2 = get_next (s2, &class_s2);
+ 
+-              /* When translating and there is an [:upper:] or [:lower:]
+-                 class in SET2, then there must be a corresponding [:lower:]
+-                 or [:upper:] class in SET1.  */
+-              if (class_s1 == UL_NONE
+-                  && (class_s2 == UL_LOWER || class_s2 == UL_UPPER))
+-                error (EXIT_FAILURE, 0,
+-                       _("misaligned [:upper:] and/or [:lower:] construct"));
+-
+               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
+                 {
+-                  case_convert = true;
+                   for (i = 0; i < N_CHARS; i++)
+                     if (islower (i))
+                       xlate[i] = toupper (i);
+                 }
+               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
+                 {
+-                  case_convert = true;
+                   for (i = 0; i < N_CHARS; i++)
+                     if (isupper (i))
+                       xlate[i] = tolower (i);
+                 }
+-              else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
+-                       || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
+-                {
+-                  /* POSIX says the behavior of `tr "[:upper:]" "[:upper:]"'
+-                     is undefined.  Treat it as a no-op.  */
+-                }
+               else
+                 {
+                   /* The following should have been checked by validate...  */
+@@ -1870,6 +1913,13 @@ main (int argc, char **argv)
+                     break;
+                   xlate[c1] = c2;
+                 }
++
++              /* When case-converting, skip the elements as an optimization.  */
++              if (class_s2 != UL_NONE)
++                {
++                  skip_construct (s1);
++                  skip_construct (s2);
++                }
+             }
+           assert (c1 == -1 || truncate_set1);
+         }
+diff --git a/tests/Makefile.am b/tests/Makefile.am
+index 5619d0b..3236637 100644
+--- a/tests/Makefile.am
++++ b/tests/Makefile.am
+@@ -260,6 +260,7 @@ TESTS =						\
+   misc/timeout					\
+   misc/timeout-parameters			\
+   misc/tr					\
++  misc/tr-case-class				\
+   misc/truncate-dangling-symlink		\
+   misc/truncate-dir-fail			\
+   misc/truncate-fail-diag			\
+diff --git a/tests/misc/tr b/tests/misc/tr
+index ca7a960..00cd8e6 100755
+--- a/tests/misc/tr
++++ b/tests/misc/tr
+@@ -155,34 +155,8 @@ my @Tests =
+ 
+   # Up to coreutils-6.9, this would provoke a failed assertion.
+   ['no-abort-1', qw(-c a '[b*256]'), {IN=>'abc'}, {OUT=>'abb'}],
+-
+-  # Up to coreutils-6.9, tr rejected an unmatched [:lower:] or [:upper:] in SET1.
+-  ['s1-lower', qw('[:lower:]' '[.*]'),
+-   {IN=>'#$%123abcABC'}, {OUT=>'#$%123...ABC'}],
+-  ['s1-upper', qw('[:upper:]' '[.*]'),
+-   {IN=>'#$%123abcABC'}, {OUT=>'#$%123abc...'}],
+-
+-  # Up to coreutils-6.9.91, this would fail with the diagnostic:
+-  # tr: misaligned [:upper:] and/or [:lower:] construct
+-  # with LC_CTYPE=en_US.ISO-8859-1.
+-  ['tolower-F', qw('[:upper:]' '[:lower:]'), {IN=>'A'}, {OUT=>'a'}],
+-
+-  # When doing a case-converting translation with something after the
+-  # [:upper:] and [:lower:] elements, ensure that tr honors the following byte.
+-  ['upcase-xtra', qw('[:lower:].' '[:upper:]x'), {IN=>'abc.'}, {OUT=>'ABCx'}],
+-  ['dncase-xtra', qw('[:upper:].' '[:lower:]x'), {IN=>'ABC.'}, {OUT=>'abcx'}],
+ );
+ 
+-# Set LC_CTYPE=en_US.ISO-8859-1 in the environment of the tolower-F test.
+-foreach my $t (@Tests)
+-  {
+-    if ($t->[0] eq 'tolower-F')
+-      {
+-        push @$t, {ENV=>'LC_CTYPE=en_US.ISO-8859-1'};
+-        last;
+-      }
+-  }
+-
+ @Tests = triple_test \@Tests;
+ 
+ # tr takes its input only from stdin, not from a file argument, so
+diff --git a/tests/misc/tr-case-class b/tests/misc/tr-case-class
+new file mode 100755
+index 0000000..d81c676
+--- /dev/null
++++ b/tests/misc/tr-case-class
+@@ -0,0 +1,112 @@
++#!/bin/sh
++# Test case conversion classes
++
++# Copyright (C) 2010 Free Software Foundation, Inc.
++
++# This program is free software: you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation, either version 3 of the License, or
++# (at your option) any later version.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++
++# You should have received a copy of the GNU General Public License
++# along with this program.  If not, see <http://www.gnu.org/licenses/>.
++
++. $srcdir/test-lib.sh
++
++# Ensure we support translation of case classes with extension
++echo '01234567899999999999999999' > exp
++echo 'abcdefghijklmnopqrstuvwxyz' |
++tr '[:lower:]' '0-9' > out || fail=1
++compare out exp || fail=1
++echo 'abcdefghijklmnopqrstuvwxyz' |
++tr '[:lower:][:lower:]' '[:upper:]0-9' > out || fail=1
++compare out exp || fail=1
++
++# Validate the alignment of case classes
++tr 'A-Z[:lower:]' 'a-y[:upper:]' < /dev/null && fail=1
++tr '[:upper:][:lower:]' 'a-y[:upper:]' < /dev/null && fail=1
++tr 'A-Y[:lower:]' 'a-z[:upper:]' < /dev/null && fail=1
++tr 'A-Z[:lower:]' '[:lower:][:upper:]' < /dev/null && fail=1
++tr 'A-Z[:lower:]' '[:lower:]A-Z' < /dev/null && fail=1
++tr '[:upper:][:lower:]' 'a-z[:upper:]' < /dev/null || fail=1
++tr '[:upper:][:lower:]' '[:upper:]a-z' < /dev/null || fail=1
++
++# Before coreutils 8.6 the trailing space in string1
++# caused the case class in string2 to be extended.
++# However that was not portable, dependent on locale
++# and in contravention of POSIX.
++tr '[:upper:] ' '[:lower:]' < /dev/null 2>out && fail=1
++echo 'tr: when translating with string1 longer than string2,
++the latter string must not end with a character class' > exp
++compare out exp || fail=1
++
++# Up to coreutils-6.9, tr rejected an unmatched [:lower:] or [:upper:] in SET1.
++echo '#$%123abcABC' | tr '[:lower:]' '[.*]' > out || fail=1
++echo '#$%123...ABC' > exp
++compare out exp || fail=1
++echo '#$%123abcABC' | tr '[:upper:]' '[.*]' > out || fail=1
++echo '#$%123abc...' > exp
++compare out exp || fail=1
++
++# When doing a case-converting translation with something after the
++# [:upper:] and [:lower:] elements, ensure that tr honors the following byte.
++echo 'abc.' | tr '[:lower:].' '[:upper:]x' > out || fail=1
++echo 'ABCx' > exp
++compare out exp || fail=1
++
++# Before coreutils 8.6 the disparate number of upper and lower
++# characters in some locales, triggered abort()s and invalid behavior
++export LC_ALL=en_US.ISO-8859-1
++
++if test "$(locale charmap 2>/dev/null)" = ISO-8859-1; then
++  # Up to coreutils-6.9.91, this would fail with the diagnostic:
++  # tr: misaligned [:upper:] and/or [:lower:] construct
++  # with LC_CTYPE=en_US.ISO-8859-1.
++  tr '[:upper:]' '[:lower:]' < /dev/null || fail=1
++
++  tr '[:upper:] ' '[:lower:]' < /dev/null 2>out && fail=1
++  echo 'tr: when translating with string1 longer than string2,
++the latter string must not end with a character class' > exp
++  compare out exp || fail=1
++
++  # Ensure when there are a different number of elements
++  # in each string, we validate the case mapping correctly
++  echo 'abc.xyz' |
++  tr 'ab[:lower:]' '0-1[:upper:]' > out || fail=1
++  echo 'ABC.XYZ' > exp
++  compare out exp || fail=1
++
++  # Ensure we extend string2 appropriately
++  echo 'ABC- XYZ' |
++  tr '[:upper:]- ' '[:lower:]_' > out || fail=1
++  echo 'abc__xyz' > exp
++  compare out exp || fail=1
++
++  # Ensure the size of the case classes are accounted
++  # for as a unit.
++  echo 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
++  tr '[:upper:]A-B' '[:lower:]0' >out || fail=1
++  echo '00cdefghijklmnopqrstuvwxyz' > exp
++  compare out exp || fail=1
++
++  # Ensure the size of the case classes are accounted
++  # for as a unit.
++  echo 'a' |
++  tr -t '[:lower:]a' '[:upper:]0' >out || fail=1
++  echo '0' > exp
++  compare out exp || fail=1
++
++  # Ensure the size of the case classes are accounted
++  # for as a unit.
++  echo 'a' |
++  tr -t '[:lower:][:lower:]a' '[:lower:][:upper:]0' >out || fail=1
++  echo '0' > exp
++  compare out exp || fail=1
++fi
++
++Exit $fail
diff --git a/coreutils.spec b/coreutils.spec
index cc92fa7..af90919 100644
--- a/coreutils.spec
+++ b/coreutils.spec
@@ -1,7 +1,7 @@
 Summary: A set of basic GNU tools commonly used in shell scripts
 Name:    coreutils
 Version: 8.5
-Release: 9%{?dist}
+Release: 10%{?dist}
 License: GPLv3+
 Group:   System Environment/Base
 Url:     http://www.gnu.org/software/coreutils/
@@ -20,6 +20,8 @@ Source203:  coreutils-runuser-l.pamd
 # From upstream
 #fix double free error in tac (reported in debian bug #594666)
 Patch1: coreutils-8.5-tac-doublefree.patch
+#fix various case conversion issues in tr(#611274)
+Patch2: coreutils-8.5-trcaseconversion.patch
 
 # Our patches
 #general patch to workaround koji build system issues
@@ -126,6 +128,7 @@ Libraries for coreutils package.
 
 # From upstream
 %patch1 -p1 -b .doublefree
+%patch2 -p1 -b .caseconvert
 
 # Our patches
 %patch100 -p1 -b .configure
@@ -158,6 +161,7 @@ Libraries for coreutils package.
 %patch951 -p1 -b .selinuxman
 
 chmod a+x tests/misc/sort-mb-tests tests/df/direct
+chmod a+x tests/misc/tr-case-class
 
 #fix typos/mistakes in localized documentation(#439410, #440056)
 find ./po/ -name "*.p*" | xargs \
@@ -344,6 +348,9 @@ fi
 %{_libdir}/coreutils
 
 %changelog
+* Thu Sep 30 2010 Ondrej Vasik <ovasik at redhat.com> - 8.5-10
+- various fixes for case conversion in tr(#611274)
+
 * Wed Sep 29 2010 jkeating - 8.5-9
 - Rebuilt for gcc bug 634757