From 65d19946bebd1cc6299e2789cc0fc097d1898e80 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Fri, 6 Sep 2013 16:50:09 +0900 Subject: [PATCH] Compute probability using structure policy. Bug: 6669677 Change-Id: Icd50fc30274cce65ebeb2e7cc8368b59e5cda05a --- .../suggest/core/dicnode/dic_node_utils.cpp | 7 ++++--- .../internal/dic_node_state_prevword.h | 4 ---- .../core/dictionary/bigram_dictionary.cpp | 3 +-- .../suggest/core/dictionary/dictionary.cpp | 2 +- .../core/dictionary/multi_bigram_map.h | 21 ++++++++++--------- .../dictionary_structure_with_buffer_policy.h | 5 ++++- native/jni/src/suggest/core/suggest.cpp | 4 +++- .../dynamic_patricia_trie_policy.cpp | 18 ++++++++++++++-- .../dictionary/dynamic_patricia_trie_policy.h | 4 +++- .../dictionary/patricia_trie_policy.cpp | 18 ++++++++++++++-- .../dictionary/patricia_trie_policy.h | 4 +++- .../dictionary/utils}/probability_utils.h | 0 12 files changed, 62 insertions(+), 28 deletions(-) rename native/jni/src/suggest/{core/dictionary => policyimpl/dictionary/utils}/probability_utils.h (100%) diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index bb54e608e..e81591992 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -21,7 +21,6 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/multi_bigram_map.h" -#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/char_utils.h" @@ -93,13 +92,15 @@ namespace latinime { if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) { // Note: Normally wordPos comes from the dictionary and should never equal // NOT_A_VALID_WORD_POS. - return ProbabilityUtils::backoff(unigramProbability); + return dictionaryStructurePolicy->getProbability(unigramProbability, + NOT_A_PROBABILITY); } if (multiBigramMap) { return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos, wordPos, unigramProbability); } - return ProbabilityUtils::backoff(unigramProbability); + return dictionaryStructurePolicy->getProbability(unigramProbability, + NOT_A_PROBABILITY); } //////////////// diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h index f437c95f6..9bc96877e 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h @@ -116,10 +116,6 @@ class DicNodeStatePrevWord { return mPrevWordStart; } - int16_t getPrevWordProbability() const { - return mPrevWordProbability; - } - int getPrevWordNodePos() const { return mPrevWordNodePos; } diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index e74a1dbc8..cf1cd8815 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -23,7 +23,6 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/char_utils.h" @@ -131,7 +130,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. - const int probability = ProbabilityUtils::computeProbabilityForBigram( + const int probability = mDictionaryStructurePolicy->getProbability( unigramProbability, bigramsIt.getProbability()); addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, outBigramCodePoints, outputTypes); diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 8418a608a..02ece639c 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -90,7 +90,7 @@ int Dictionary::getProbability(const int *word, int length) const { if (NOT_A_VALID_WORD_POS == pos) { return NOT_A_PROBABILITY; } - return getDictionaryStructurePolicy()->getUnigramProbability(pos); + return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos); } bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const { diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index fb4a80083..9efe5f6f9 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -22,7 +22,6 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/bloom_filter.h" -#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/hash_map_compat.h" @@ -43,11 +42,12 @@ class MultiBigramMap { hash_map_compat::const_iterator mapPosition = mBigramMaps.find(wordPosition); if (mapPosition != mBigramMaps.end()) { - return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); + return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, + unigramProbability); } if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { addBigramsForWordPosition(structurePolicy, wordPosition); - return mBigramMaps[wordPosition].getBigramProbability( + return mBigramMaps[wordPosition].getBigramProbability(structurePolicy, nextWordPosition, unigramProbability); } return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, @@ -82,17 +82,17 @@ class MultiBigramMap { } AK_FORCE_INLINE int getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nextWordPosition, const int unigramProbability) const { + int bigramProbability = NOT_A_PROBABILITY; if (mBloomFilter.isInFilter(nextWordPosition)) { const hash_map_compat::const_iterator bigramProbabilityIt = mBigramMap.find(nextWordPosition); if (bigramProbabilityIt != mBigramMap.end()) { - const int bigramProbability = bigramProbabilityIt->second; - return ProbabilityUtils::computeProbabilityForBigram( - unigramProbability, bigramProbability); + bigramProbability = bigramProbabilityIt->second; } } - return ProbabilityUtils::backoff(unigramProbability); + return structurePolicy->getProbability(unigramProbability, bigramProbability); } private: @@ -111,17 +111,18 @@ class MultiBigramMap { AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary( const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const int nextWordPosition, const int unigramProbability) { + int bigramProbability = NOT_A_PROBABILITY; const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), bigramsListPos); while (bigramsIt.hasNext()) { bigramsIt.next(); if (bigramsIt.getBigramPos() == nextWordPosition) { - return ProbabilityUtils::computeProbabilityForBigram( - unigramProbability, bigramsIt.getProbability()); + bigramProbability = bigramsIt.getProbability(); + break; } } - return ProbabilityUtils::backoff(unigramProbability); + return structurePolicy->getProbability(unigramProbability, bigramProbability); } static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 532411509..c8cbbcfdf 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -47,7 +47,10 @@ class DictionaryStructureWithBufferPolicy { virtual int getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const = 0; - virtual int getUnigramProbability(const int nodePos) const = 0; + virtual int getProbability(const int unigramProbability, + const int bigramProbability) const = 0; + + virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0; virtual int getShortcutPositionOfNode(const int nodePos) const = 0; diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 7d8dd21c5..e788e914a 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -171,7 +171,9 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) + doubleLetterCost; - const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0; + const bool isPossiblyOffensiveWord = + traverseSession->getDictionaryStructurePolicy()->getProbability( + terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; const bool isExactMatch = terminalDicNode->isExactMatch(); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); // Heuristic: We exclude freq=0 first-char-uppercase words from exact match. diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 3b9878b82..945677b50 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -24,6 +24,7 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { @@ -134,7 +135,20 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in return NOT_A_VALID_WORD_POS; } -int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const { +int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + // TODO: check mHeaderPolicy.usesForgettingCurve(); + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_PROBABILITY; } @@ -144,7 +158,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const { if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { return NOT_A_PROBABILITY; } - return nodeReader.getProbability(); + return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); } int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 5873d3d65..cdab0e16a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -57,7 +57,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const; - int getUnigramProbability(const int nodePos) const; + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int nodePos) const; int getShortcutPositionOfNode(const int nodePos) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp index adcf2dbdf..d5a83a938 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -21,6 +21,7 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { @@ -306,7 +307,19 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, } } -int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const { +int PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_PROBABILITY; } @@ -324,7 +337,8 @@ int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const { return NOT_A_PROBABILITY; } PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); - return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( + mDictRoot, &pos), NOT_A_PROBABILITY); } int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index d0567fd85..75d976205 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -56,7 +56,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const; - int getUnigramProbability(const int nodePos) const; + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int nodePos) const; int getShortcutPositionOfNode(const int nodePos) const; diff --git a/native/jni/src/suggest/core/dictionary/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h similarity index 100% rename from native/jni/src/suggest/core/dictionary/probability_utils.h rename to native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h