From 9955716d0bb8b370eeab3bb4c9ab3108c45c7289 Mon Sep 17 00:00:00 2001 From: satok Date: Thu, 26 Jan 2012 22:49:13 +0900 Subject: [PATCH] Merge missing space and mistyped space correction algorithm Change-Id: Idd64d38d3d29be24748f9c0359667883698a5756 --- native/src/correction.cpp | 29 ++---- native/src/correction.h | 8 +- native/src/defines.h | 5 +- native/src/unigram_dictionary.cpp | 158 ++++++++++++++---------------- native/src/unigram_dictionary.h | 17 +--- 5 files changed, 93 insertions(+), 124 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 2458bca86..ee5023532 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -158,10 +158,10 @@ void Correction::checkState() { } } -int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const unsigned short *word) { - return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - firstFreq, secondFreq, this, word); +int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray, + const bool isSpaceProximity, const unsigned short *word) { + return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this, + isSpaceProximity, word); } int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { @@ -806,21 +806,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const /* static */ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const Correction* correction, - const unsigned short *word) { - const int spaceProximityPos = correction->mSpaceProximityPos; - const int missingSpacePos = correction->mMissingSpacePos; - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int inputLength = correction->mInputLength; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); + const int *freqArray, const int *wordLengthArray, const Correction* correction, + const bool isSpaceProximity, const unsigned short *word) { + const int firstFreq = freqArray[0]; + const int secondFreq = freqArray[1]; + const int firstWordLength = wordLengthArray[0]; + const int secondWordLength = wordLengthArray[1]; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; diff --git a/native/src/correction.h b/native/src/correction.h index aec7bbd73..b246070fe 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -122,7 +122,8 @@ class Correction { bool needsToPrune() const; int getFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const unsigned short *word); + const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity, + const unsigned short *word); int getFinalFreq(const int freq, unsigned short **word, int* wordLength); int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength, const int inputLength); @@ -150,8 +151,9 @@ class Correction { static int calculateFinalFreq(const int inputIndex, const int depth, const int freq, int *editDistanceTable, const Correction* correction, const int inputLength); - static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const Correction* correction, const unsigned short *word); + static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray, + const Correction* correction, const bool isSpaceProximity, + const unsigned short *word); static double calcNormalizedScore(const unsigned short* before, const int beforeLength, const unsigned short* after, const int afterLength, const int score); static int editDistance(const unsigned short* before, diff --git a/native/src/defines.h b/native/src/defines.h index 7e171acfd..c25f963e0 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -180,10 +180,9 @@ static void prof_out(void) { #define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true -#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true -#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true +#define SUGGEST_MULTIPLE_WORDS true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 @@ -233,7 +232,7 @@ static void prof_out(void) { // Minimum suggest depth for one word for all cases except for missing space suggestions. #define MIN_SUGGEST_DEPTH 1 -#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3 +#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 #define min(a,b) ((a)<(b)?(a):(b)) diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index fd6f14af8..8b1a25d90 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -211,7 +211,6 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_END(3); PROF_START(4); - // Note: This line is intentionally left blank bool hasAutoCorrectionCandidate = false; WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); if (masterQueue->size() > 0) { @@ -222,14 +221,14 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_END(4); PROF_START(5); - // Suggestions with missing space - if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER - && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { + // Multiple word suggestions + if (SUGGEST_MULTIPLE_WORDS + && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { for (int i = 1; i < inputLength; ++i) { if (DEBUG_DICT) { - AKLOGI("--- Suggest missing space characters %d", i); + AKLOGI("--- Suggest multiple words %d", i); } - getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, + getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool, hasAutoCorrectionCandidate); } @@ -237,26 +236,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_END(5); PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { - // The first and last "mistyped spaces" are taken care of by excessive character handling - for (int i = 1; i < inputLength - 1; ++i) { - if (DEBUG_DICT) { - AKLOGI("--- Suggest words with proximity space %d", i); - } - const int x = xcoordinates[i]; - const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) { - AKLOGI("Input[%d] x = %d, y = %d, has space proximity = %d", - i, x, y, proximityInfo->hasSpaceProximity(x, y)); - } - if (proximityInfo->hasSpaceProximity(x, y)) { - getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, i, correction, queuePool, - hasAutoCorrectionCandidate); - } - } - } + // Note: This line is intentionally left blank PROF_END(6); + if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { @@ -337,24 +319,6 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, } } -void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const bool useFullEditDistance, - const int inputLength, const int missingSpacePos, Correction *correction, - WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { - getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */, - correction, queuePool, hasAutoCorrectionCandidate); -} - -void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const bool useFullEditDistance, - const int inputLength, const int spaceProximityPos, Correction *correction, - WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { - getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos, - correction, queuePool, hasAutoCorrectionCandidate); -} - inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, @@ -405,15 +369,23 @@ inline void UnigramDictionary::onTerminal(const int freq, } } -int UnigramDictionary::getSubStringSuggestion( +bool UnigramDictionary::getSubStringSuggestion( ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool* queuePool, const int inputLength, const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, - const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength) { + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { + if (DEBUG_DICT) { + assert(currentWordIndex >= 1); + } unsigned short* tempOutputWord = 0; int tempOutputWordLength = 0; + // TODO: Optimize init suggestion + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + inputLength, correction); + int freq = getMostFrequentWordLike( inputWordStartPos, inputWordLength, proximityInfo, mWord); if (freq > 0) { @@ -438,7 +410,7 @@ int UnigramDictionary::getSubStringSuggestion( } WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); if (!queue || queue->size() < 1) { - return 0; + return false; } int score = 0; const double ns = queue->getHighestNormalizedScore( @@ -451,91 +423,103 @@ int UnigramDictionary::getSubStringSuggestion( // threshold. if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { - return 0; + return false; } freq = score >> (tempOutputWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { - AKLOGI("Freq(%d): %d", currentWordIndex, freq); + AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d" + , currentWordIndex, freq, tempOutputWordLength, inputWordLength, inputWordStartPos); } if (freq <= 0 || tempOutputWordLength <= 0 || MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) { - return 0; + return false; } for (int i = 0; i < tempOutputWordLength; ++i) { outputWord[outputWordStartPos + i] = tempOutputWord[i]; } + + // Put output values + freqArray[currentWordIndex - 1] = freq; + // TODO: put output length instead of input length + wordLengthArray[currentWordIndex - 1] = inputWordLength; + *outputWordLength = outputWordStartPos + tempOutputWordLength; + if ((inputWordStartPos + inputWordLength) < inputLength) { if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) { - return 0; + return false; } outputWord[outputWordStartPos + tempOutputWordLength] = SPACE; - ++tempOutputWordLength; + ++*outputWordLength; + } else if (currentWordIndex >= 2) { + // TODO: Handle 3 or more words + const int pairFreq = correction->getFreqForSplitTwoWords( + freqArray, wordLengthArray, isSpaceProximity, outputWord); + if (DEBUG_DICT) { + AKLOGI("Split two words: %d, %d, %d, %d", freqArray[0], freqArray[1], pairFreq, + inputLength); + } + addWord(outputWord, *outputWordLength, pairFreq, queuePool->getMasterQueue()); } - *outputWordLength = outputWordStartPos + tempOutputWordLength; - return freq; + return true; } void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, const int missingSpacePos, - const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool, + const bool useFullEditDistance, const int inputLength, const int wordDivideIndex, + Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { if (inputLength >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16 assert(MAX_PROXIMITY_CHARS == 16); } - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - inputLength, correction); - // Allocating fixed length array on stack unsigned short outputWord[MAX_WORD_LENGTH]; + int freqArray[SUB_QUEUE_MAX_WORD_INDEX]; + int wordLengthArray[SUB_QUEUE_MAX_WORD_INDEX]; int outputWordLength = 0; - WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - const bool isSpaceProximity = spaceProximityPos >= 0; - // First word int inputWordStartPos = 0; - int inputWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int firstFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + int inputWordLength = wordDivideIndex; + if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, - FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, outputWord, &outputWordLength); - if (firstFreq <= 0) { + FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */, + freqArray, wordLengthArray, outputWord, &outputWordLength)) { return; } + const int tempOutputWordLength = outputWordLength; // Second word - inputWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; - inputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); - const int secondFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + // Missing space + inputWordStartPos = wordDivideIndex; + inputWordLength = inputLength - wordDivideIndex; + getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, - SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, outputWordLength, outputWord, - &outputWordLength); - if (secondFreq <= 0) { + SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength, + false /* missing space */, freqArray, wordLengthArray, outputWord, &outputWordLength); + + // Mistyped space + ++inputWordStartPos; + --inputWordLength; + + if (inputWordLength <= 0) { return; } - // TODO: Remove initSuggestions and correction->setCorrectionParams - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, spaceProximityPos, missingSpacePos, - useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); - const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, outputWord); - if (DEBUG_DICT) { - AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + const int x = xcoordinates[inputWordStartPos - 1]; + const int y = ycoordinates[inputWordStartPos - 1]; + if (!proximityInfo->hasSpaceProximity(x, y)) { + return; } - addWord(outputWord, outputWordLength, pairFreq, masterQueue); - return; + + getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength, + true /* mistyped space */, freqArray, wordLengthArray, outputWord, &outputWordLength); } // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 0f50ccbd8..79793d676 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -103,17 +103,9 @@ class UnigramDictionary { const int currentWordIndex); void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, - const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool, + const bool useFullEditDistance, const int inputLength, const int wordDivideIndex, + Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); - void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const bool useFullEditDistance, - const int inputLength, const int missingSpacePos, Correction *correction, - WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); - void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const bool useFullEditDistance, - const int inputLength, const int spaceProximityPos, Correction *correction, - WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex); @@ -127,13 +119,14 @@ class UnigramDictionary { ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, short unsigned int *outWord); - int getSubStringSuggestion( + bool getSubStringSuggestion( ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool* queuePool, const int inputLength, const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, - const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength); + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int *wordLengthArray, unsigned short* outputWord, int *outputWordLength); const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH;