rpms/python-nltk/FC-3 python-nltk-1.4.2-maxent.patch, NONE, 1.1 .cvsignore, 1.2, 1.3 python-nltk.spec, 1.4, 1.5 sources, 1.4, 1.5
Michel Alexandre Salim (salimma)
fedora-extras-commits at redhat.com
Thu Nov 9 19:14:29 UTC 2006
Author: salimma
Update of /cvs/extras/rpms/python-nltk/FC-3
In directory cvs-int.fedora.redhat.com:/tmp/cvs-serv19025
Modified Files:
.cvsignore python-nltk.spec sources
Added Files:
python-nltk-1.4.2-maxent.patch
Log Message:
- Update to version 1.4.2 for FE3, with a patch that reverts maxent.py to the revision that does not require numarray
python-nltk-1.4.2-maxent.patch:
--- NEW FILE python-nltk-1.4.2-maxent.patch ---
--- nltk-1.4.2/nltk/classifier/maxent.py.orig 2004-07-19 06:43:49.000000000 -0400
+++ nltk-1.4.2/nltk/classifier/maxent.py 2006-11-09 14:10:32.000000000 -0500
@@ -5,7 +5,7 @@
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
#
-# $Id: maxent.py,v 1.32 2004/07/19 05:34:37 edloper Exp $
+# $Id: maxent.py,v 1.31 2004/07/17 21:03:19 edloper Exp $
"""
@@ -14,33 +14,20 @@
emperically consistant with the training data; and chooses the
distribution with the highest entropy. A probability distribution is
X{emperically consistant} with a set of training data if its estimated
-frequency with which a class and a feature vector value co-occur is
-equal to the actual frequency in the data.
+frequency for each pair M{(c, f[i])} is equal to the pair's actual
+frequency in the data, where M{c} is a class and M{f[i]} is the M{i}th
+feature vector element.
+
+ SUM[t|c[t]=c0] f[i][t]
+freq(c0, f[i]) = -----------------------
+ SUM[t] f[i][t]
-for each pair M{(c, f[i])} is equal to the pair's actual
-frequency in the data, where M{c} is a class and M{f[i]} is a value
-for the M{i}th feature vector element.
-"""
-
-# This needs to be updated/fixed:!!
-"""
- SUM[t|t[CLASS]=c] t[FEATURE_VECTOR][i]
-actual freq(c, i) = ----------------------------------------
- SUM[t] t[FEATURE_VECTOR][i]
-
-
- SUM[t] P(t[CLASS]=c) t[FEATURE_VECTOR][i]
-est freq(c, i) = -------------------------------------------
-
-
- P(t[c]=c0) t[f][i]
-prob(c0, f[i]) = SUM[t] ---------------------------
- SUM[c]
-
-- C{t}: A token
-- C{t[CLASS]}: Token C{t}'s class
-- C{t[FEATURE_VECTOR]}: Token C{t}'s feature vector
+ SUM[t] SUM[c] P(c[t]=c0) f[i][t]
+prob(c0, f[i]) = ---------------------------------
+
+c[t]
+f[i][t]
the frequency of each (class,
@@ -124,9 +111,9 @@
from nltk import TaskI, PropertyIndirectionMixIn
import time, types
-# Don't use from .. imports, because math and numarray provide
+# Don't use from .. imports, because math and Numeric provide
# different definitions for useful functions (exp, log, etc.)
-import math, numarray
+import math, Numeric
##//////////////////////////////////////////////////////
## Maxent Classifier
@@ -346,7 +333,7 @@
# [XX] requires: features must be encoded with a GISFeatureEncoder!
class GISMaxentClassifierTrainer(ClassifierTrainerI):
def _fcount_emperical(self, train_toks):
- fcount = numarray.zeros(self._weight_vector_len, 'd')
+ fcount = Numeric.zeros(self._weight_vector_len, 'd')
for tok in train_toks:
feature_vector = tok['FEATURE_VECTOR']
@@ -358,7 +345,7 @@
return fcount
def _fcount_estimated(self, classifier, train_toks):
- fcount = numarray.zeros(self._weight_vector_len, 'd')
+ fcount = Numeric.zeros(self._weight_vector_len, 'd')
for tok in train_toks:
dist = classifier.get_class_probs(tok)
@@ -423,14 +410,14 @@
# An array that is 1 whenever fcount_emperical is zero. In
# other words, it is one for any feature that's not attested
# in the training data. This is used to avoid division by zero.
- unattested = numarray.zeros(len(fcount_emperical))
+ unattested = Numeric.zeros(len(fcount_emperical))
for i in range(len(fcount_emperical)):
if fcount_emperical[i] == 0: unattested[i] = 1
# Build the classifier. Start with weight=1 for each feature,
# except for the unattested features. Start those out at
# zero, since we know that's the correct value.
- weights = numarray.ones(len(fcount_emperical), 'd')
+ weights = Numeric.ones(len(fcount_emperical), 'd')
weights -= unattested
classifier = ConditionalExponentialClassifier(classes, weights)
@@ -553,7 +540,7 @@
emperical frequency for feature M{i}.
@rtype: C{array} of C{float}
"""
- fcount = numarray.zeros(self._weight_vector_len, 'd')
+ fcount = Numeric.zeros(self._weight_vector_len, 'd')
for tok in train_toks:
feature_vector = tok['FEATURE_VECTOR']
@@ -672,12 +659,12 @@
NEWTON_CONVERGE = 1e-12
MAX_NEWTON = 30
- deltas = numarray.ones(self._weight_vector_len, 'd')
+ deltas = Numeric.ones(self._weight_vector_len, 'd')
# Precompute the A matrix:
# A[nf][id] = sum ( p(text) * p(label|text) * f(text,label) )
# over all label,text s.t. num_features[label,text]=nf
- A = numarray.zeros((len(nfmap), self._weight_vector_len), 'd')
+ A = Numeric.zeros((len(nfmap), self._weight_vector_len), 'd')
for i, tok in enumerate(train_toks):
dist = classifier.get_class_probs(tok)
@@ -702,11 +689,11 @@
# - sum2[i][nf] = sum p(text)p(label|text)f[i](label,text)
# nf exp(delta[i]nf)
for rangenum in range(MAX_NEWTON):
- nf_delta = numarray.outerproduct(nfarray, deltas)
- exp_nf_delta = numarray.exp(nf_delta)
+ nf_delta = Numeric.outerproduct(nfarray, deltas)
+ exp_nf_delta = Numeric.exp(nf_delta)
nf_exp_nf_delta = nftranspose * exp_nf_delta
- sum1 = numarray.sum(exp_nf_delta * A)
- sum2 = numarray.sum(nf_exp_nf_delta * A)
+ sum1 = Numeric.sum(exp_nf_delta * A)
+ sum2 = Numeric.sum(nf_exp_nf_delta * A)
# Avoid division by zero.
sum2 += unattested
@@ -715,8 +702,8 @@
deltas -= (ffreq_emperical - sum1) / -sum2
# We can stop once we converge.
- n_error = (numarray.sum(abs((ffreq_emperical-sum1)))/
- numarray.sum(abs(deltas)))
+ n_error = (Numeric.sum(abs((ffreq_emperical-sum1)))/
+ Numeric.sum(abs(deltas)))
if n_error < NEWTON_CONVERGE:
return deltas
@@ -822,20 +809,20 @@
nfmap = self._nfmap(train_toks)
nfs = nfmap.items()
nfs.sort(lambda x,y:cmp(x[1],y[1]))
- nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
- nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))
+ nfarray = Numeric.array([nf for (nf, i) in nfs], 'd')
+ nftranspose = Numeric.reshape(nfarray, (len(nfarray), 1))
# An array that is 1 whenever ffreq_emperical is zero. In
# other words, it is one for any feature that's not attested
# in the data. This is used to avoid division by zero.
- unattested = numarray.zeros(self._weight_vector_len, 'd')
+ unattested = Numeric.zeros(self._weight_vector_len, 'd')
for i in range(len(unattested)):
if ffreq_emperical[i] == 0: unattested[i] = 1
# Build the classifier. Start with weight=1 for each feature,
# except for the unattested features. Start those out at
# zero, since we know that's the correct value.
- weights = numarray.ones(self._weight_vector_len, 'd')
+ weights = Numeric.ones(self._weight_vector_len, 'd')
weights -= unattested
classifier = ConditionalExponentialClassifier(classes, weights)
@@ -859,7 +846,7 @@
# Use the deltas to update our weights.
weights = classifier.weights()
- weights *= numarray.exp(deltas)
+ weights *= Numeric.exp(deltas)
classifier.set_weights(weights)
# Check log-likelihood cutoffs.
@@ -963,4 +950,4 @@
s += '%5s=%.3f' % (val,prob)
print s + ' ...'
-if __name__ == '__main__': demo(1)
+if __name__ == '__main__': demo(30)
Index: .cvsignore
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/.cvsignore,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- .cvsignore 17 Nov 2005 05:22:25 -0000 1.2
+++ .cvsignore 9 Nov 2006 19:13:59 -0000 1.3
@@ -1 +1 @@
-nltk-1.4.4.tar.gz
+nltk-1.4.2.tar.gz
Index: python-nltk.spec
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/python-nltk.spec,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- python-nltk.spec 8 Nov 2006 17:31:50 -0000 1.4
+++ python-nltk.spec 9 Nov 2006 19:13:59 -0000 1.5
@@ -1,20 +1,21 @@
%{!?python_sitelib: %define python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print get_python_lib()")}
Name: python-nltk
-Version: 1.4.4
-Release: 3.1%{?dist}
+Version: 1.4.2
+Release: 2%{?dist}
Summary: Natural Language Toolkit
Group: Development/Libraries
License: GPL
URL: http://nltk.sf.net/
Source0: http://dl.sourceforge.net/nltk/nltk-%{version}.tar.gz
+Patch0: python-nltk-1.4.2-maxent.patch
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
BuildArch: noarch
BuildRequires: python
Requires: python-abi = %(%{__python} -c "import sys ; print sys.version[:3]")
-Requires: python-numarray, tkinter
+Requires: python-numeric, tkinter
%description
The Natural Language Toolkit is a Python package that simplifies the
@@ -24,6 +25,7 @@
%prep
%setup -q -n nltk-%{version}
+%patch0 -p1 -b .maxent
%build
@@ -60,14 +62,12 @@
%changelog
-* Wed Nov 8 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-3.1
-- Rebuild with correct source
+* Thu Nov 9 2006 Michel Salim <michel.salim at gmail.com> - 1.4.2-2
+- Package .pyos in compliance with new policy
+- Revert maxent.py to a version that does not use numarray
-* Wed Nov 8 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-3
-- Include *.pyos, for compliance with new Python guidelines
+* Mon Feb 6 2006 Michel Salim <michel.salim at gmail.com> - 1.4.2-1
+- Downgrade to 1.4.2 for FC3 only, to match older version of Python installed
-* Mon Oct 9 2006 Michel Salim <michel.salim at gmail.com> - 1.4.4-2
-- Rebuild for FE6
-
-* Sun Sep 11 2005 Michel Salim <michel.salim[AT]gmail.com> - 1.4.4-1
+* Sun Sep 11 2005 Michel Salim <michel.salim at gmail.com> - 1.4.4-1
- Initial package
Index: sources
===================================================================
RCS file: /cvs/extras/rpms/python-nltk/FC-3/sources,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- sources 8 Nov 2006 17:31:50 -0000 1.4
+++ sources 9 Nov 2006 19:13:59 -0000 1.5
@@ -1 +1 @@
-fb402d42d71844fe4483affa100814e5 nltk-1.4.4.tar.gz
+f24e6ac9c2152e9cf6f9fa09e259d783 nltk-1.4.2.tar.gz
More information about the scm-commits mailing list