[libpinyin] increase train factor

Peng Wu pwu at fedoraproject.org
Fri Nov 25 07:25:48 UTC 2011


commit 27618325a2fa64d5195528853831b41e3b77ef11
Author: Peng Wu <alexepico at gmail.com>
Date:   Fri Nov 25 15:25:31 2011 +0800

    increase train factor

 libpinyin-0.3.x-head.patch |   86 ++++++++++++++++++++++++++++++++++++++++++++
 libpinyin.spec             |    9 +++--
 2 files changed, 92 insertions(+), 3 deletions(-)
---
diff --git a/libpinyin-0.3.x-head.patch b/libpinyin-0.3.x-head.patch
index e69de29..4b1512a 100644
--- a/libpinyin-0.3.x-head.patch
+++ b/libpinyin-0.3.x-head.patch
@@ -0,0 +1,86 @@
+From f332a01334342bdd4169324bdf889386ff3676fa Mon Sep 17 00:00:00 2001
+From: Peng Wu <alexepico at gmail.com>
+Date: Thu, 24 Nov 2011 13:02:10 +0800
+Subject: [PATCH 1/3] increase train_factor because of larger model data
+
+---
+ src/lookup/pinyin_lookup.cpp |    4 ++--
+ 1 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
+index d6ba68c..7146e51 100644
+--- a/src/lookup/pinyin_lookup.cpp
++++ b/src/lookup/pinyin_lookup.cpp
+@@ -449,7 +449,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const
+     //TODO: verify the new training method.
+     phrase_token_t last_token = sentence_start;
+     // constraints->len + 1 == results->len
+-    guint32 train_factor = 23;
++    guint32 train_factor = 23 * 5;
+     for ( size_t i = 0; i < constraints->len; ++i){
+ 	phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ 	if ( *token == null_token )
+@@ -466,7 +466,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const
+ 	    //std::cout<<"i:"<<i<<"last_token:"<<last_token<<"\ttoken:"<<*token<<std::endl;
+ 	    m_phrase_index->get_phrase_item(*token, m_cache_phrase_item);
+ 	    m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor);
+-	    m_phrase_index->add_unigram_frequency(*token, train_factor);
++	    m_phrase_index->add_unigram_frequency(*token, train_factor * 10);
+ 	    if ( last_token ){
+ 		SingleGram * system, *user;
+ 		m_system_bigram->load(last_token, system);
+-- 
+1.7.7.3
+
+
+From de8057576011eb536d87194da10c9ec48dd8d092 Mon Sep 17 00:00:00 2001
+From: Peng Wu <alexepico at gmail.com>
+Date: Fri, 25 Nov 2011 14:58:45 +0800
+Subject: [PATCH 2/3] add const modifiers to train factor
+
+---
+ src/lookup/pinyin_lookup.cpp |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp
+index 7146e51..e2f563c 100644
+--- a/src/lookup/pinyin_lookup.cpp
++++ b/src/lookup/pinyin_lookup.cpp
+@@ -449,7 +449,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const
+     //TODO: verify the new training method.
+     phrase_token_t last_token = sentence_start;
+     // constraints->len + 1 == results->len
+-    guint32 train_factor = 23 * 5;
++    const guint32 train_factor = 23 * 5;
+     for ( size_t i = 0; i < constraints->len; ++i){
+ 	phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
+ 	if ( *token == null_token )
+-- 
+1.7.7.3
+
+
+From 47dca981b4d0f155f80087ee892bd2ff80429e7c Mon Sep 17 00:00:00 2001
+From: Peng Wu <alexepico at gmail.com>
+Date: Fri, 25 Nov 2011 15:18:42 +0800
+Subject: [PATCH 3/3] update lambda parameter
+
+---
+ src/include/novel_types.h |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/src/include/novel_types.h b/src/include/novel_types.h
+index 1c4fb2b..110d041 100644
+--- a/src/include/novel_types.h
++++ b/src/include/novel_types.h
+@@ -144,7 +144,7 @@ typedef guint32 table_offset_t;
+ 
+ typedef double parameter_t;
+ 
+-#define LAMBDA_PARAMETER 0.588792
++#define LAMBDA_PARAMETER 0.330642
+ 
+ /* Array of phrase_token_t */
+ typedef GArray * TokenVector;
+-- 
+1.7.7.3
+
diff --git a/libpinyin.spec b/libpinyin.spec
index a558b9a..af3d135 100644
--- a/libpinyin.spec
+++ b/libpinyin.spec
@@ -1,12 +1,12 @@
 Name:           libpinyin
 Version:        0.3.0
-Release:        1%{?dist}
+Release:        2%{?dist}
 Summary:        Library to deal with pinyin
 
 License:        GPLv2+
 URL:            https://github.com/libpinyin/libpinyin
 Source0:        https://github.com/downloads/libpinyin/libpinyin/%{name}-%{version}.tar.gz
-#Patch0:         libpinyin-0.3.x-head.patch
+Patch0:         libpinyin-0.3.x-head.patch
 
 BuildRequires:  db4-devel, glib2-devel
 Requires:       %{name}-data = %{version}-%{release}
@@ -35,7 +35,7 @@ The %{name}-data package contains data files.
 
 %prep
 %setup -q
-#%patch0 -p1 -b .head
+%patch0 -p1 -b .head
 
 
 %build
@@ -70,6 +70,9 @@ find $RPM_BUILD_ROOT -name '*.la' -exec rm -f {} ';'
 %{_datadir}/libpinyin/data
 
 %changelog
+* Fri Nov 25 2011  Peng Wu <pwu at redhat.com> - 0.3.0-2
+- Increase train factor and update lambda parameter
+
 * Fri Nov 18 2011  Peng Wu <pwu at redhat.com> - 0.3.0-1
 - Update to 0.3.0
 


More information about the scm-commits mailing list