Author: nhosoi
Update of /cvs/dirsec/ldapserver/ldap/servers/plugins/syntaxes
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv27683
Modified Files:
phonetic.c
Log Message:
Resolves: #483668
Summary: Syntax plugin (phonetic): "Sounds like" does not support Western
European characters
Description: added support for Latin-1 characters (Unicode U+00C0 - U+00FF)
Index: phonetic.c
===================================================================
RCS file: /cvs/dirsec/ldapserver/ldap/servers/plugins/syntaxes/phonetic.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- phonetic.c 12 Jan 2009 19:18:38 -0000 1.6
+++ phonetic.c 3 Feb 2009 19:15:26 -0000 1.7
@@ -230,268 +230,440 @@
/* N O P Q R S T U V W X Y Z */
/* Macros to access character coding array */
-#define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1)
/* AEIOU */
+#define vowel(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] &
1) || /* AEIOU */ \
+ (((*(x)==0xC3) && (*((x)+1))) ? ((0x80<=*((x)+1) &&
*((x)+1)<0x87) || \
+ (0x88<=*((x)+1) && *((x)+1)<0x90) || (0x92<=*((x)+1) &&
*((x)+1)<0x97) || \
+ (0x98<=*((x)+1) && *((x)+1)<0x9D) || (0xA0<=*((x)+1) &&
*((x)+1)<0xA7) || \
+ (0xA8<=*((x)+1) && *((x)+1)<0xB0) || (0xB2<=*((x)+1) &&
*((x)+1)<0xB7) || \
+ (0xB8<=*((x)+1) && *((x)+1)<0xBD)) : 0 ) /* Latin-1 characters */ )
+/*
+ case 0xC3:
+*/
#define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2)
/* FJLMNR */
#define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4)
/* CGPST */
-#define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8)
/* EIY */
+#define frontv(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] &
8) || /* EIY */ \
+ (((*(x)==0xC3) && (*((x)+1))) ? ((0x88<=*((x)+1) &&
*((x)+1)<0x90) || \
+ (0xA8<=*((x)+1) && *((x)+1)<0xB0)) : 0 ) /* Latin-1 E/I */ )
#define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] &
16) /* BDH */
char *
phonetic( char *Word )
{
- char *n, *n_start, *n_end; /* pointers to string */
- char *metaph_end; /* pointers to metaph */
- char ntrans[42]; /* word with uppercase letters */
- int KSflag; /* state flag for X -> KS */
- char buf[MAXPHONEMELEN + 2];
- char *Metaph;
-
- /*
- * Copy Word to internal buffer, dropping non-alphabetic characters
- * and converting to upper case
- */
- n = ntrans + 4; n_end = ntrans + 35;
- while (!iswordbreak( Word ) && n < n_end) {
- if (isascii(*Word)) {
- if (isalpha(*Word)) {
- *n++ = TOUPPER(*Word);
- }
- ++Word;
- } else {
- auto const size_t len = LDAP_UTF8COPY(n, Word);
- n += len; Word += len;
+ unsigned char *n, *n_start, *n_end; /* pointers to string */
+ char *metaph_end; /* pointers to metaph */
+ char ntrans[42]; /* word with uppercase letters */
+ int KSflag; /* state flag for X -> KS */
+ char buf[MAXPHONEMELEN + 2];
+ char *Metaph;
+
+ /*
+ * Copy Word to internal buffer, dropping non-alphabetic characters
+ * and converting to upper case
+ */
+ n = ntrans + 4; n_end = ntrans + 35;
+ while (!iswordbreak( Word ) && n < n_end) {
+ if (isascii(*Word)) {
+ if (isalpha(*Word)) {
+ *n++ = TOUPPER(*Word);
}
+ ++Word;
+ } else {
+ auto const size_t len = LDAP_UTF8COPY(n, Word);
+ n += len; Word += len;
}
- Metaph = buf;
- *Metaph = '\0';
- if (n == ntrans + 4) {
- return( slapi_ch_strdup( buf ) ); /* Return if null */
+ }
+ Metaph = buf;
+ *Metaph = '\0';
+ if (n == ntrans + 4) {
+ return( slapi_ch_strdup( buf ) ); /* Return if null */
+ }
+ n_end = n; /* Set n_end to end of string */
+
+ /* ntrans[0] will always be == 0 */
+ ntrans[0] = '\0';
+ ntrans[1] = '\0';
+ ntrans[2] = '\0';
+ ntrans[3] = '\0';
+ *n++ = 0;
+ *n++ = 0;
+ *n++ = 0;
+ *n = 0; /* Pad with nulls */
+ n = ntrans + 4; /* Assign pointer to start */
+
+ /* Check for PN, KN, GN, AE, WR, WH, and X at start */
+ switch (*n) {
+ case 'P':
+ case 'K':
+ case 'G':
+ /* 'PN', 'KN', 'GN' becomes 'N' */
+ if (*(n + 1) == 'N')
+ *n++ = 0;
+ break;
+ case 'A':
+ /* 'AE' becomes 'E' */
+ if (*(n + 1) == 'E')
+ *n++ = 0;
+ break;
+ case 'W':
+ /* 'WR' becomes 'R', and 'WH' to 'H' */
+ if (*(n + 1) == 'R')
+ *n++ = 0;
+ else if (*(n + 1) == 'H') {
+ *n++ = 0;
}
- n_end = n; /* Set n_end to end of string */
+ break;
+ case 'X':
+ /* 'X' becomes 'S' */
+ *n = 'S';
+ break;
+ case 0xC3:
+ switch (*(n+1)) {
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ *n++ = 0;
+ *n = 'A';
+ break;
+ case 0x87:
+ *n++ = 0;
+ *n = 'C';
+ break;
+ case 0x86:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ *n++ = 0;
+ *n = 'E';
+ break;
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ *n++ = 0;
+ *n = 'I';
+ break;
+ case 0x90: /* eth: TH */
+ *n++ = 0;
+ *n = '0';
+ break;
+ case 0x91:
+ *n++ = 0;
+ *n = 'N';
+ break;
+ case 0x92:
+ case 0x93:
+ case 0x94:
+ case 0x95:
+ case 0x96:
+ case 0x98:
+ *n++ = 0;
+ *n = 'O';
+ break;
+ case 0x99:
+ case 0x9A:
+ case 0x9B:
+ case 0x9C:
+ *n++ = 0;
+ *n = 'U';
+ break;
+ case 0x9D:
+ *n++ = 0;
+ *n = 'Y';
+ break;
+ case 0x9E:
+ *n++ = 0;
+ *n = '0'; /* thorn: TH */
+ break;
+ case 0x9F:
+ *n++ = 0;
+ *n = 's';
+ break;
+ case 0xA0:
+ case 0xA1:
+ case 0xA2:
+ case 0xA3:
+ case 0xA4:
+ case 0xA5:
+ *n++ = 0;
+ *n = 'a';
+ break;
+ case 0xA6:
+ *n++ = 0;
+ *n = 'e';
+ break;
+ case 0xA7:
+ *n++ = 0;
+ *n = 'c';
+ break;
+ case 0xA8:
+ case 0xA9:
+ case 0xAA:
+ case 0xAB:
+ *n++ = 0;
+ *n = 'e';
+ break;
+ case 0xAC:
+ case 0xAD:
+ case 0xAE:
+ case 0xAF:
+ *n++ = 0;
+ *n = 'i';
+ break;
+ case 0xB0:
+ *n++ = 0;
+ *n = '0'; /* eth: th */
+ break;
+ case 0xB1:
+ *n++ = 0;
+ *n = 'n';
+ break;
+ case 0xB2:
+ case 0xB3:
+ case 0xB4:
+ case 0xB5:
+ case 0xB6:
+ case 0xB8:
+ *n++ = 0;
+ *n = 'o';
+ break;
+ case 0xB9:
+ case 0xBA:
+ case 0xBB:
+ case 0xBC:
+ *n++ = 0;
+ *n = 'u';
+ break;
+ case 0xBD:
+ case 0xBF:
+ *n++ = 0;
+ *n = 'y';
+ break;
+ case 0xBE:
+ *n++ = 0;
+ *n = '0'; /* thorn: th */
+ break;
+ }
+ break;
+ }
- /* ntrans[0] will always be == 0 */
- ntrans[0] = '\0';
- ntrans[1] = '\0';
- ntrans[2] = '\0';
- ntrans[3] = '\0';
- *n++ = 0;
- *n++ = 0;
- *n++ = 0;
- *n = 0; /* Pad with nulls */
- n = ntrans + 4; /* Assign pointer to start */
-
- /* Check for PN, KN, GN, AE, WR, WH, and X at start */
- switch (*n) {
- case 'P':
- case 'K':
- case 'G':
- /* 'PN', 'KN', 'GN' becomes 'N' */
- if (*(n + 1) == 'N')
- *n++ = 0;
- break;
- case 'A':
- /* 'AE' becomes 'E' */
- if (*(n + 1) == 'E')
- *n++ = 0;
- break;
- case 'W':
- /* 'WR' becomes 'R', and 'WH' to 'H' */
- if (*(n + 1) == 'R')
- *n++ = 0;
- else if (*(n + 1) == 'H') {
- *(n + 1) = *n;
- *n++ = 0;
+ /*
+ * Now, loop step through string, stopping at end of string or when
+ * the computed 'metaph' is MAXPHONEMELEN characters long
+ */
+
+ KSflag = 0; /* state flag for KS translation */
+ for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
+ n <= n_end && Metaph < metaph_end; n++) {
+ if (KSflag) {
+ KSflag = 0;
+ *Metaph++ = 'S';
+ } else if (!isascii(*n)) {
+ switch (*n) {
+ case 0xC3:
+ if (n+1 <= n_end) {
+ switch (*(++n)) {
+ case 0x87: /* C with cedilla */
+ case 0x9F: /* ess-zed */
+ case 0xA7: /* c with cedilla */
+ *Metaph++ = 'S';
+ break;
+ case 0x90: /* eth: TH */
+ case 0x9E: /* thorn: TH */
+ case 0xB0: /* eth: th */
+ case 0xBE: /* thorn: th */
+ *Metaph++ = '0';
+ break;
+ case 0x91:
+ case 0xB1:
+ *Metaph++ = 'N';
+ break;
+ case 0x9D:
+ case 0xBD:
+ case 0xBF:
+ *Metaph++ = 'Y';
+ break;
+ default: /* skipping the rest */
+ break;
+ }
}
break;
- case 'X':
- /* 'X' becomes 'S' */
- *n = 'S';
- break;
- }
+ default:
+ *Metaph++ = *n;
+ }
+ } else {
+ /* Drop duplicates except for CC */
+ if (*(n - 1) == *n && *n != 'C')
+ continue;
+ /* Check for F J L M N R or first letter vowel */
+ if (same(*n) || (n == n_start && vowel(n))) {
+ *Metaph++ = *n;
+ } else {
+ switch (*n) {
+ case 'B':
- /*
- * Now, loop step through string, stopping at end of string or when
- * the computed 'metaph' is MAXPHONEMELEN characters long
- */
-
- KSflag = 0; /* state flag for KS translation */
- for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
- n <= n_end && Metaph < metaph_end; n++) {
- if (KSflag) {
- KSflag = 0;
- *Metaph++ = 'S';
- } else if (!isascii(*n)) {
+ /*
+ * B unless in -MB
+ */
+ if (n < (n_end - 1) && *(n - 1) != 'M') {
*Metaph++ = *n;
- } else {
- /* Drop duplicates except for CC */
- if (*(n - 1) == *n && *n != 'C')
- continue;
- /* Check for F J L M N R or first letter vowel */
- if (same(*n) || (n == n_start && vowel(*n))) {
- *Metaph++ = *n;
+ }
+ break;
+ case 'C':
+
+ /*
+ * X if in -CIA-, -CH- else S if in
+ * -CI-, -CE-, -CY- else dropped if
+ * in -SCI-, -SCE-, -SCY- else K
+ */
+ if (*(n - 1) != 'S' || !frontv((n + 1))) {
+ if (*(n + 1) == 'I' && *(n + 2) == 'A')
{
+ *Metaph++ = 'X';
+ } else if (frontv((n + 1))) {
+ *Metaph++ = 'S';
+ } else if (*(n + 1) == 'H') {
+ *Metaph++ = ((n == n_start && !vowel((n + 2)))
+ || *(n - 1) == 'S')
+ ? (char) 'K' : (char) 'X';
} else {
- switch (*n) {
- case 'B':
-
- /*
- * B unless in -MB
- */
- if (n < (n_end - 1) && *(n - 1) !=
'M') {
- *Metaph++ = *n;
- }
- break;
- case 'C':
-
- /*
- * X if in -CIA-, -CH- else S if in
- * -CI-, -CE-, -CY- else dropped if
- * in -SCI-, -SCE-, -SCY- else K
- */
- if (*(n - 1) != 'S' || !frontv(*(n + 1)))
{
- if (*(n + 1) == 'I' &&
*(n + 2) == 'A') {
- *Metaph++ = 'X';
- } else if (frontv(*(n + 1))) {
- *Metaph++ = 'S';
- } else if (*(n + 1) == 'H') {
- *Metaph++ = ((n == n_start
&& !vowel(*(n + 2)))
- || *(n - 1) == 'S')
- ? (char) 'K' : (char)
'X';
- } else {
- *Metaph++ = 'K';
- }
- }
- break;
- case 'D':
-
- /*
- * J if in DGE or DGI or DGY else T
- */
- *Metaph++ = (*(n + 1) == 'G' &&
frontv(*(n + 2)))
- ? (char) 'J' : (char) 'T';
- break;
- case 'G':
-
- /*
- * F if in -GH and not B--GH, D--GH,
- * -H--GH, -H---GH else dropped if
- * -GNED, -GN, -DGE-, -DGI-, -DGY-
- * else J if in -GE-, -GI-, -GY- and
- * not GG else K
- */
- if ((*(n + 1) != 'J' || vowel(*(n + 2)))
&&
- (*(n + 1) != 'N' || ((n + 1) <
n_end &&
- (*(n + 2) != 'E'
|| *(n + 3) != 'D'))) &&
- (*(n - 1) != 'D' || !frontv(*(n +
1))))
- *Metaph++ = (frontv(*(n + 1)) &&
- *(n + 2) != 'G') ?
(char) 'G' : (char) 'K';
- else if (*(n + 1) == 'H' &&
!noghf(*(n - 3)) &&
- *(n - 4) != 'H')
- *Metaph++ = 'F';
- break;
- case 'H':
-
- /*
- * H if before a vowel and not after
- * C, G, P, S, T else dropped
- */
- if (!varson(*(n - 1)) && (!vowel(*(n -
1)) ||
- vowel(*(n + 1))))
- *Metaph++ = 'H';
- break;
- case 'K':
-
- /*
- * dropped if after C else K
- */
- if (*(n - 1) != 'C')
- *Metaph++ = 'K';
- break;
- case 'P':
-
- /*
- * F if before H, else P
- */
- *Metaph++ = *(n + 1) == 'H' ?
- (char) 'F' : (char) 'P';
- break;
- case 'Q':
-
- /*
- * K
- */
- *Metaph++ = 'K';
- break;
- case 'S':
-
- /*
- * X in -SH-, -SIO- or -SIA- else S
- */
- *Metaph++ = (*(n + 1) == 'H' ||
- (*(n + 1) == 'I' &&
(*(n + 2) == 'O' ||
- *(n + 2) == 'A')))
- ? (char) 'X' : (char) 'S';
- break;
- case 'T':
-
- /*
- * X in -TIA- or -TIO- else 0 (zero)
- * before H else dropped if in -TCH-
- * else T
- */
- if (*(n + 1) == 'I' && (*(n + 2)
== 'O' ||
- *(n + 2) == 'A'))
- *Metaph++ = 'X';
- else if (*(n + 1) == 'H')
- *Metaph++ = '0';
- else if (*(n + 1) != 'C' || *(n + 2) !=
'H')
- *Metaph++ = 'T';
- break;
- case 'V':
-
- /*
- * F
- */
- *Metaph++ = 'F';
- break;
- case 'W':
-
- /*
- * W after a vowel, else dropped
- */
- case 'Y':
-
- /*
- * Y unless followed by a vowel
- */
- if (vowel(*(n + 1)))
- *Metaph++ = *n;
- break;
- case 'X':
-
- /*
- * KS
- */
- if (n == n_start)
- *Metaph++ = 'S';
- else {
- *Metaph++ = 'K'; /* Insert
K, then S */
- KSflag = 1;
- }
- break;
- case 'Z':
-
- /*
- * S
- */
- *Metaph++ = 'S';
- break;
- }
+ *Metaph++ = 'K';
}
+ }
+ break;
+ case 'D':
+
+ /*
+ * J if in DGE or DGI or DGY else T
+ */
+ *Metaph++ = (*(n + 1) == 'G' && frontv((n + 2)))
+ ? (char) 'J' : (char) 'T';
+ break;
+ case 'G':
+
+ /*
+ * F if in -GH and not B--GH, D--GH,
+ * -H--GH, -H---GH else dropped if
+ * -GNED, -GN, -DGE-, -DGI-, -DGY-
+ * else J if in -GE-, -GI-, -GY- and
+ * not GG else K
+ */
+ if ((*(n + 1) != 'J' || vowel((n + 2))) &&
+ (*(n + 1) != 'N' || ((n + 1) < n_end &&
+ (*(n + 2) != 'E' || *(n + 3) != 'D')))
&&
+ (*(n - 1) != 'D' || !frontv((n + 1))))
+ *Metaph++ = (frontv((n + 1)) &&
+ *(n + 2) != 'G') ? (char) 'G' : (char)
'K';
+ else if (*(n + 1) == 'H' && !noghf(*(n - 3))
&&
+ *(n - 4) != 'H')
+ *Metaph++ = 'F';
+ break;
+ case 'H':
+
+ /*
+ * H if before a vowel and not after
+ * C, G, P, S, T else dropped
+ */
+ if (!varson(*(n - 1)) && (!vowel((n - 1)) ||
+ vowel((n + 1))))
+ *Metaph++ = 'H';
+ break;
+ case 'K':
+
+ /*
+ * dropped if after C else K
+ */
+ if (*(n - 1) != 'C')
+ *Metaph++ = 'K';
+ break;
+ case 'P':
+
+ /*
+ * F if before H, else P
+ */
+ *Metaph++ = *(n + 1) == 'H' ?
+ (char) 'F' : (char) 'P';
+ break;
+ case 'Q':
+
+ /*
+ * K
+ */
+ *Metaph++ = 'K';
+ break;
+ case 'S':
+
+ /*
+ * X in -SH-, -SIO- or -SIA- else S
+ */
+ *Metaph++ = (*(n + 1) == 'H' ||
+ (*(n + 1) == 'I' && (*(n + 2) == 'O'
||
+ *(n + 2) == 'A')))
+ ? (char) 'X' : (char) 'S';
+ break;
+ case 'T':
+
+ /*
+ * X in -TIA- or -TIO- else 0 (zero)
+ * before H else dropped if in -TCH-
+ * else T
+ */
+ if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
+ *(n + 2) == 'A'))
+ *Metaph++ = 'X';
+ else if (*(n + 1) == 'H')
+ *Metaph++ = '0';
+ else if (*(n + 1) != 'C' || *(n + 2) != 'H')
+ *Metaph++ = 'T';
+ break;
+ case 'V':
+
+ /*
+ * F
+ */
+ *Metaph++ = 'F';
+ break;
+ case 'W':
+
+ /*
+ * W after a vowel, else dropped
+ */
+ case 'Y':
+
+ /*
+ * Y unless followed by a vowel
+ */
+ if (vowel((n + 1)))
+ *Metaph++ = *n;
+ break;
+ case 'X':
+
+ /*
+ * KS
+ */
+ if (n == n_start)
+ *Metaph++ = 'S';
+ else {
+ *Metaph++ = 'K'; /* Insert K, then S */
+ KSflag = 1;
+ }
+ break;
+ case 'Z':
+
+ /*
+ * S
+ */
+ *Metaph++ = 'S';
+ break;
}
+ }
}
+ }
- *Metaph = 0; /* Null terminate */
- return( slapi_ch_strdup( buf ) );
+ *Metaph = 0; /* Null terminate */
+ return( slapi_ch_strdup( buf ) );
}
#endif /* METAPHONE */