rpms/python-nltk/FC-3 python-nltk-1.4.2-maxent.patch, NONE, 1.1 .cvsignore, 1.2, 1.3 python-nltk.spec, 1.4, 1.5 sources, 1.4, 1.5

Thu Nov 9 19:14:29 UTC 2006

Author: salimma

Update of /cvs/extras/rpms/python-nltk/FC-3
In directory cvs-int.fedora.redhat.com:/tmp/cvs-serv19025

Modified Files:
	.cvsignore python-nltk.spec sources 
Added Files:
	python-nltk-1.4.2-maxent.patch 
Log Message:
- Updated the 1.4.2 package for FE3, adding a patch that reverts maxent.py to the revision that does not require numarray


python-nltk-1.4.2-maxent.patch:

--- NEW FILE python-nltk-1.4.2-maxent.patch ---
--- nltk-1.4.2/nltk/classifier/maxent.py.orig	2004-07-19 06:43:49.000000000 -0400
+++ nltk-1.4.2/nltk/classifier/maxent.py	2006-11-09 14:10:32.000000000 -0500
@@ -5,7 +5,7 @@
 # URL: <http://nltk.sf.net>
 # For license information, see LICENSE.TXT
 #
-# $Id: maxent.py,v 1.32 2004/07/19 05:34:37 edloper Exp $
+# $Id: maxent.py,v 1.31 2004/07/17 21:03:19 edloper Exp $
 
 """
 
@@ -14,33 +14,20 @@
 emperically consistant with the training data; and chooses the
 distribution with the highest entropy.  A probability distribution is
 X{emperically consistant} with a set of training data if its estimated
-frequency with which a class and a feature vector value co-occur is
-equal to the actual frequency in the data.
+frequency for each pair M{(c, f[i])} is equal to the pair's actual
+frequency in the data, where M{c} is a class and M{f[i]} is the M{i}th
+feature vector element.
+
+                 SUM[t|c[t]=c0] f[i][t]
+freq(c0, f[i]) = -----------------------
+                    SUM[t] f[i][t]
 
-for each pair M{(c, f[i])} is equal to the pair's actual
-frequency in the data, where M{c} is a class and M{f[i]} is a value
-for the M{i}th feature vector element.
 
-"""
-
-# This needs to be updated/fixed:!!
-"""
-                     SUM[t|t[CLASS]=c] t[FEATURE_VECTOR][i]
-actual freq(c, i) = ----------------------------------------
-                          SUM[t] t[FEATURE_VECTOR][i]
-
-
-                  SUM[t] P(t[CLASS]=c) t[FEATURE_VECTOR][i]
-est freq(c, i) = -------------------------------------------
-
-
-                         P(t[c]=c0) t[f][i]
-prob(c0, f[i]) = SUM[t] ---------------------------
-                          SUM[c]
-
-- C{t}: A token
-- C{t[CLASS]}: Token C{t}'s class
-- C{t[FEATURE_VECTOR]}: Token C{t}'s feature vector
+                 SUM[t] SUM[c] P(c[t]=c0) f[i][t]
+prob(c0, f[i]) = ---------------------------------
+                 
+c[t]
+f[i][t]
 
 
 the frequency of each (class, 
@@ -124,9 +111,9 @@
 from nltk import TaskI, PropertyIndirectionMixIn
 import time, types
 
-# Don't use from .. imports, because math and numarray provide
+# Don't use from .. imports, because math and Numeric provide
 # different definitions for useful functions (exp, log, etc.)
-import math, numarray
+import math, Numeric
 
 ##//////////////////////////////////////////////////////
 ##  Maxent Classifier
@@ -346,7 +333,7 @@
 # [XX] requires: features must be encoded with a GISFeatureEncoder!
 class GISMaxentClassifierTrainer(ClassifierTrainerI):
     def _fcount_emperical(self, train_toks):
-        fcount = numarray.zeros(self._weight_vector_len, 'd')
+        fcount = Numeric.zeros(self._weight_vector_len, 'd')
 
         for tok in train_toks:
             feature_vector = tok['FEATURE_VECTOR']
@@ -358,7 +345,7 @@
         return fcount
 
     def _fcount_estimated(self, classifier, train_toks):
-        fcount = numarray.zeros(self._weight_vector_len, 'd')
+        fcount = Numeric.zeros(self._weight_vector_len, 'd')
 
         for tok in train_toks:
             dist = classifier.get_class_probs(tok)
@@ -423,14 +410,14 @@
         # An array that is 1 whenever fcount_emperical is zero.  In
         # other words, it is one for any feature that's not attested
         # in the training data.  This is used to avoid division by zero.
-        unattested = numarray.zeros(len(fcount_emperical))
+        unattested = Numeric.zeros(len(fcount_emperical))
         for i in range(len(fcount_emperical)):
             if fcount_emperical[i] == 0: unattested[i] = 1
 
         # Build the classifier.  Start with weight=1 for each feature,
         # except for the unattested features.  Start those out at
         # zero, since we know that's the correct value.
-        weights = numarray.ones(len(fcount_emperical), 'd')
+        weights = Numeric.ones(len(fcount_emperical), 'd')
         weights -= unattested
         classifier = ConditionalExponentialClassifier(classes, weights)
 
@@ -553,7 +540,7 @@
             emperical frequency for feature M{i}.
         @rtype: C{array} of C{float}
         """
-        fcount = numarray.zeros(self._weight_vector_len, 'd')
+        fcount = Numeric.zeros(self._weight_vector_len, 'd')
 
         for tok in train_toks:
             feature_vector = tok['FEATURE_VECTOR']
@@ -672,12 +659,12 @@
         NEWTON_CONVERGE = 1e-12
         MAX_NEWTON = 30
         
-        deltas = numarray.ones(self._weight_vector_len, 'd')
+        deltas = Numeric.ones(self._weight_vector_len, 'd')
 
         # Precompute the A matrix:
         # A[nf][id] = sum ( p(text) * p(label|text) * f(text,label) )
         # over all label,text s.t. num_features[label,text]=nf
-        A = numarray.zeros((len(nfmap), self._weight_vector_len), 'd')
+        A = Numeric.zeros((len(nfmap), self._weight_vector_len), 'd')
 
         for i, tok in enumerate(train_toks):
             dist = classifier.get_class_probs(tok)
@@ -702,11 +689,11 @@
         #   - sum2[i][nf] = sum p(text)p(label|text)f[i](label,text)
         #                       nf exp(delta[i]nf)
         for rangenum in range(MAX_NEWTON):
-            nf_delta = numarray.outerproduct(nfarray, deltas)
-            exp_nf_delta = numarray.exp(nf_delta)
+            nf_delta = Numeric.outerproduct(nfarray, deltas)
+            exp_nf_delta = Numeric.exp(nf_delta)
             nf_exp_nf_delta = nftranspose * exp_nf_delta
-            sum1 = numarray.sum(exp_nf_delta * A) 
-            sum2 = numarray.sum(nf_exp_nf_delta * A)
+            sum1 = Numeric.sum(exp_nf_delta * A) 
+            sum2 = Numeric.sum(nf_exp_nf_delta * A)
 
             # Avoid division by zero.
             sum2 += unattested
@@ -715,8 +702,8 @@
             deltas -= (ffreq_emperical - sum1) / -sum2
 
             # We can stop once we converge.
-            n_error = (numarray.sum(abs((ffreq_emperical-sum1)))/
-                       numarray.sum(abs(deltas)))
+            n_error = (Numeric.sum(abs((ffreq_emperical-sum1)))/
+                       Numeric.sum(abs(deltas)))
             if n_error < NEWTON_CONVERGE:
                 return deltas
 
@@ -822,20 +809,20 @@
         nfmap = self._nfmap(train_toks)
         nfs = nfmap.items()
         nfs.sort(lambda x,y:cmp(x[1],y[1]))
-        nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
-        nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))
+        nfarray = Numeric.array([nf for (nf, i) in nfs], 'd')
+        nftranspose = Numeric.reshape(nfarray, (len(nfarray), 1))
 
         # An array that is 1 whenever ffreq_emperical is zero.  In
         # other words, it is one for any feature that's not attested
         # in the data.  This is used to avoid division by zero.
-        unattested = numarray.zeros(self._weight_vector_len, 'd')
+        unattested = Numeric.zeros(self._weight_vector_len, 'd')
         for i in range(len(unattested)):
             if ffreq_emperical[i] == 0: unattested[i] = 1
 
         # Build the classifier.  Start with weight=1 for each feature,
         # except for the unattested features.  Start those out at
         # zero, since we know that's the correct value.
-        weights = numarray.ones(self._weight_vector_len, 'd')
+        weights = Numeric.ones(self._weight_vector_len, 'd')
         weights -= unattested
         classifier = ConditionalExponentialClassifier(classes, weights)
                 
@@ -859,7 +846,7 @@
 
             # Use the deltas to update our weights.
             weights = classifier.weights()
-            weights *= numarray.exp(deltas)
+            weights *= Numeric.exp(deltas)
             classifier.set_weights(weights)
                         
             # Check log-likelihood cutoffs.
@@ -963,4 +950,4 @@
             s += '%5s=%.3f' % (val,prob)
         print s + ' ...'
     
-if __name__ == '__main__': demo(1)
+if __name__ == '__main__': demo(30)


Index: .cvsignore
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/.cvsignore,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- .cvsignore	17 Nov 2005 05:22:25 -0000	1.2
+++ .cvsignore	9 Nov 2006 19:13:59 -0000	1.3
@@ -1 +1 @@
-nltk-1.4.4.tar.gz
+nltk-1.4.2.tar.gz


Index: python-nltk.spec
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/python-nltk.spec,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- python-nltk.spec	8 Nov 2006 17:31:50 -0000	1.4
+++ python-nltk.spec	9 Nov 2006 19:13:59 -0000	1.5
@@ -1,20 +1,21 @@
 %{!?python_sitelib: %define python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print get_python_lib()")}
 
 Name:           python-nltk
-Version:        1.4.4
-Release:        3.1%{?dist}
+Version:        1.4.2
+Release:        2%{?dist}
 Summary:        Natural Language Toolkit
 
 Group:          Development/Libraries
 License:        GPL
 URL:            http://nltk.sf.net/
 Source0:        http://dl.sourceforge.net/nltk/nltk-%{version}.tar.gz
+Patch0:         python-nltk-1.4.2-maxent.patch
 BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 BuildArch:	noarch
 
 BuildRequires:  python
 Requires:       python-abi = %(%{__python} -c "import sys ; print sys.version[:3]")
-Requires:	python-numarray, tkinter
+Requires:	python-numeric, tkinter
 
 %description
 The Natural Language Toolkit is a Python package that simplifies the
@@ -24,6 +25,7 @@
 
 %prep
 %setup -q -n nltk-%{version}
+%patch0 -p1 -b .maxent
 
 
 %build
@@ -60,14 +62,12 @@
 
 
 %changelog
-* Wed Nov  8 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-3.1
-- Rebuild with correct source
+* Thu Nov  9 2006 Michel Salim <michel.salim at gmail.com> - 1.4.2-2
+- Package .pyos in compliance with new policy
+- Revert maxent.py to a version that does not use numarray
 
-* Wed Nov  8 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-3
-- Include *.pyos, for compliance with new Python guidelines
+* Mon Feb  6 2006 Michel Salim <michel.salim at gmail.com> - 1.4.2-1
+- Downgrade to 1.4.2 for FC3 only, to match older version of Python installed
 
-* Mon Oct  9 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-2
-- Rebuild for FE6
-
-* Sun Sep 11 2005 Michel Salim <michel.salim[AT]gmail.com> - 1.4.4-1
+* Sun Sep 11 2005 Michel Salim <michel.salim at gmail.com> - 1.4.4-1
 - Initial package


Index: sources
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/sources,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- sources	8 Nov 2006 17:31:50 -0000	1.4
+++ sources	9 Nov 2006 19:13:59 -0000	1.5
@@ -1 +1 @@
-fb402d42d71844fe4483affa100814e5  nltk-1.4.4.tar.gz
+f24e6ac9c2152e9cf6f9fa09e259d783  nltk-1.4.2.tar.gz