From c76bbceedc804d1f2988cbf032b530a107a7d561 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Wed, 2 Oct 2013 17:51:12 +0900 Subject: [PATCH] Stochastic decay. Bug: 6669677 Change-Id: Ib2d9228b951c77dab7a8675ce9db60677e87e771 --- .../bigram/dynamic_bigram_list_policy.cpp | 12 +- ...namic_patricia_trie_gc_event_listeners.cpp | 4 +- .../dynamic_patricia_trie_writing_helper.cpp | 2 +- .../utils/forgetting_curve_utils.cpp | 111 ++++++++---------- .../dictionary/utils/forgetting_curve_utils.h | 31 ++--- .../latin/BinaryDictionaryDecayingTests.java | 12 +- 6 files changed, 77 insertions(+), 95 deletions(-) diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 9c3dc393f..8753c6eb0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const } *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); - if (mIsDecayingDict && !ForgettingCurveUtils::isValidBigram(*outProbability)) { + if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) { // This bigram is too weak to output. *outBigramPos = NOT_A_DICT_POS; } else { @@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( bigramFlags); const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedBigramProbabilityDelta( - originalProbability, probability) : probability; + ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + probability) : probability; const BigramListReadWriteUtils::BigramFlags updatedFlags = BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probabilityToWrite); @@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con int *const writingPos) { // hasNext is false because we are adding a new bigram entry at the end of the bigram list. const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) : + ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) : probability; return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, probabilityToWrite, false /* hasNext */, writingPos); @@ -365,9 +365,9 @@ bool DynamicBigramListPolicy::updateProbabilityForDecay( *outRemoved = false; if (mIsDecayingDict) { // Update bigram probability for decaying. - const int newProbability = ForgettingCurveUtils::getBigramProbabilityDeltaToSave( + const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags)); - if (ForgettingCurveUtils::isValidBigram(newProbability)) { + if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { // Write new probability. const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = BigramListReadWriteUtils::setProbabilityInFlags( diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index 26ea6ab68..324b53062 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -29,14 +29,14 @@ bool DynamicPatriciaTrieGcEventListeners bool isUselessPtNode = !node->isTerminal(); if (node->isTerminal() && mIsDecayingDict) { const int newProbability = - ForgettingCurveUtils::getUnigramProbabilityToSave(node->getProbability()); + ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability()); int writingPos = node->getProbabilityFieldPos(); // Update probability. if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( mBuffer, newProbability, &writingPos)) { return false; } - if (!ForgettingCurveUtils::isValidUnigram(newProbability)) { + if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { isUselessPtNode = false; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 9dc2abe20..70a9ee564 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -545,7 +545,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, const int newProbability) { if (mNeedsToDecay) { - return ForgettingCurveUtils::getUpdatedUnigramProbability(originalProbability, + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, newProbability); } else { return newProbability; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index c525a8ab5..62a19a5a6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" @@ -26,106 +28,91 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; -const int ForgettingCurveUtils::MAX_UNIGRAM_PROBABILITY = 120; -const int ForgettingCurveUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24; -const int ForgettingCurveUtils::UNIGRAM_PROBABILITY_STEP = 8; -const int ForgettingCurveUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15; -const int ForgettingCurveUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3; -const int ForgettingCurveUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1; +const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; +const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; +const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; +// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected +// duration of the decay is approximately 66hours. +const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; /* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta) { + const int encodedBigramProbability) { if (encodedUnigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; - } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) { - const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability( - encodedUnigramProbability)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + } else if (encodedBigramProbability == NOT_A_PROBABILITY) { + return backoff(decodeUnigramProbability(encodedUnigramProbability)); } else { - const int rawProbability = ProbabilityUtils::computeProbabilityForBigram( - decodeUnigramProbability(encodedUnigramProbability), - decodeBigramProbabilityDelta(encodedBigramProbabilityDelta)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + const int unigramProbability = decodeUnigramProbability(encodedUnigramProbability); + const int bigramProbability = decodeBigramProbability(encodedBigramProbability); + return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::getUpdatedUnigramProbability( +// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding +// (i.e. unigram probability + bigram probability delta). +/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( const int originalEncodedProbability, const int newProbability) { if (originalEncodedProbability == NOT_A_PROBABILITY) { - // The unigram is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The unigram is not in other dictionaries. - return 0; - } else { - return MIN_VALID_UNIGRAM_PROBABILITY; - } - } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) { - return MIN_VALID_UNIGRAM_PROBABILITY; - } - return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY); - } -} - -/* static */ int ForgettingCurveUtils::getUnigramProbabilityToSave(const int encodedProbability) { - return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0); -} - -/* static */ int ForgettingCurveUtils::getBigramProbabilityDeltaToSave( - const int encodedProbabilityDelta) { - return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0); -} - -/* static */ int ForgettingCurveUtils::getUpdatedBigramProbabilityDelta( - const int originalEncodedProbabilityDelta, const int newProbability) { - if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) { // The bigram relation is not in this dictionary. if (newProbability == NOT_A_PROBABILITY) { // The bigram target is not in other dictionaries. return 0; } else { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; + return MIN_VALID_ENCODED_PROBABILITY; } } else { if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; + && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { + return MIN_VALID_ENCODED_PROBABILITY; } - return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP, - MAX_BIGRAM_PROBABILITY_DELTA); + return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::isValidUnigram(const int encodedUnigramProbability) { - return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY; +/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { + return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; } -/* static */ int ForgettingCurveUtils::isValidBigram(const int encodedBigramProbabilityDelta) { - return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA; +/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability) { + const int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); + // TODO: Implement the decay in more proper way. + const float currentRate = static_cast(currentEncodedProbability) + / static_cast(MAX_ENCODED_PROBABILITY); + const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY + + (1.0f - MIN_PROBABILITY_TO_DECAY) * (1.0f - currentRate); + const float randValue = static_cast(rand()) / static_cast(RAND_MAX); + if (thresholdToDecay < randValue) { + return max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 0); + } else { + return currentEncodedProbability; + } } /* static */ int ForgettingCurveUtils::decodeUnigramProbability(const int encodedProbability) { - const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY; + const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY; if (probability < 0) { return NOT_A_PROBABILITY; } else { - return min(probability, MAX_UNIGRAM_PROBABILITY); + return min(probability, MAX_ENCODED_PROBABILITY) * 8; } } -/* static */ int ForgettingCurveUtils::decodeBigramProbabilityDelta( - const int encodedProbabilityDelta) { - const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA; - if (probabilityDelta < 0) { +/* static */ int ForgettingCurveUtils::decodeBigramProbability(const int encodedProbability) { + const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY; + if (probability < 0) { return NOT_A_PROBABILITY; } else { - return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA); + return min(probability, MAX_ENCODED_PROBABILITY) * 8; } } -/* static */ int ForgettingCurveUtils::getDecayedProbability(const int rawProbability) { - return rawProbability; +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else { + return max(unigramProbability - 8, 0); + } } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index 11ffb2e2f..281f76a9c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -24,7 +24,6 @@ namespace latinime { // TODO: Check the elapsed time and decrease the probability depending on the time. Time field is // required to introduced to each terminal PtNode and bigram entry. // TODO: Quit using bigram probability to indicate the delta. -// TODO: Quit using bigram probability delta. class ForgettingCurveUtils { public: static const int MAX_UNIGRAM_COUNT; @@ -33,38 +32,30 @@ class ForgettingCurveUtils { static const int MAX_BIGRAM_COUNT_AFTER_GC; static int getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta); + const int encodedBigramProbability); - static int getUpdatedUnigramProbability(const int originalEncodedProbability, + static int getUpdatedEncodedProbability(const int originalEncodedProbability, const int newProbability); - static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta, - const int newProbability); + static int isValidEncodedProbability(const int encodedProbability); - static int isValidUnigram(const int encodedUnigramProbability); - - static int isValidBigram(const int encodedProbabilityDelta); - - static int getUnigramProbabilityToSave(const int encodedProbability); - - static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta); + static int getEncodedProbabilityToSave(const int encodedProbability); private: DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); static const int MAX_COMPUTED_PROBABILITY; - static const int MAX_UNIGRAM_PROBABILITY; - static const int MIN_VALID_UNIGRAM_PROBABILITY; - static const int UNIGRAM_PROBABILITY_STEP; - static const int MAX_BIGRAM_PROBABILITY_DELTA; - static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA; - static const int BIGRAM_PROBABILITY_DELTA_STEP; + static const int MAX_ENCODED_PROBABILITY; + static const int MIN_VALID_ENCODED_PROBABILITY; + static const int ENCODED_PROBABILITY_STEP; + + static const float MIN_PROBABILITY_TO_DECAY; static int decodeUnigramProbability(const int encodedProbability); - static int decodeBigramProbabilityDelta(const int encodedProbability); + static int decodeBigramProbability(const int encodedProbability); - static int getDecayedProbability(const int rawProbability); + static int backoff(const int unigramProbability); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 8a439fc22..b2d31c21f 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -50,14 +50,18 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } private void forcePassingShortTime(final BinaryDictionary binaryDictionary) { - binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); - binaryDictionary.flushWithGC(); + // Entries having low probability would be suppressed once in 2 GCs. + final int count = 2; + for (int i = 0; i < count; i++) { + binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); + binaryDictionary.flushWithGC(); + } } private void forcePassingLongTime(final BinaryDictionary binaryDictionary) { // Currently, probabilities are decayed when GC is run. All entries that have never been - // typed in 32 GCs are removed. - final int count = 32; + // typed in 128 GCs would be removed. + final int count = 128; for (int i = 0; i < count; i++) { binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); binaryDictionary.flushWithGC();