Merge the missing-space and mistyped-space correction algorithms

Change-Id: Idd64d38d3d29be24748f9c0359667883698a5756
This commit is contained in:
satok 2012-01-26 22:49:13 +09:00
parent 5971a0a0bb
commit 9955716d0b
5 changed files with 93 additions and 124 deletions

View File

@ -158,10 +158,10 @@ void Correction::checkState() {
}
}
int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const unsigned short *word) {
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
firstFreq, secondFreq, this, word);
int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
const bool isSpaceProximity, const unsigned short *word) {
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
isSpaceProximity, word);
}
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@ -806,21 +806,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
/* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
const int firstFreq, const int secondFreq, const Correction* correction,
const unsigned short *word) {
const int spaceProximityPos = correction->mSpaceProximityPos;
const int missingSpacePos = correction->mMissingSpacePos;
if (DEBUG_DICT) {
int inputCount = 0;
if (spaceProximityPos >= 0) ++inputCount;
if (missingSpacePos >= 0) ++inputCount;
assert(inputCount <= 1);
}
const bool isSpaceProximity = spaceProximityPos >= 0;
const int inputLength = correction->mInputLength;
const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
: (inputLength - missingSpacePos);
const int *freqArray, const int *wordLengthArray, const Correction* correction,
const bool isSpaceProximity, const unsigned short *word) {
const int firstFreq = freqArray[0];
const int secondFreq = freqArray[1];
const int firstWordLength = wordLengthArray[0];
const int secondWordLength = wordLengthArray[1];
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
bool firstCapitalizedWordDemotion = false;

View File

@ -122,7 +122,8 @@ class Correction {
bool needsToPrune() const;
int getFreqForSplitTwoWords(
const int firstFreq, const int secondFreq, const unsigned short *word);
const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
const unsigned short *word);
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
const int inputLength);
@ -150,8 +151,9 @@ class Correction {
static int calculateFinalFreq(const int inputIndex, const int depth,
const int freq, int *editDistanceTable, const Correction* correction,
const int inputLength);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction, const unsigned short *word);
static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
const Correction* correction, const bool isSpaceProximity,
const unsigned short *word);
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
const unsigned short* after, const int afterLength, const int score);
static int editDistance(const unsigned short* before,

View File

@ -180,10 +180,9 @@ static void prof_out(void) {
#define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true
#define SUGGEST_MULTIPLE_WORDS true
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
@ -233,7 +232,7 @@ static void prof_out(void) {
// Minimum suggest depth for one word for all cases except for missing space suggestions.
#define MIN_SUGGEST_DEPTH 1
#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3
#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
#define min(a,b) ((a)<(b)?(a):(b))

View File

@ -211,7 +211,6 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
PROF_END(3);
PROF_START(4);
// Note: This line is intentionally left blank
bool hasAutoCorrectionCandidate = false;
WordsPriorityQueue* masterQueue = queuePool->getMasterQueue();
if (masterQueue->size() > 0) {
@ -222,14 +221,14 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
PROF_END(4);
PROF_START(5);
// Suggestions with missing space
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
// Multiple word suggestions
if (SUGGEST_MULTIPLE_WORDS
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
for (int i = 1; i < inputLength; ++i) {
if (DEBUG_DICT) {
AKLOGI("--- Suggest missing space characters %d", i);
AKLOGI("--- Suggest multiple words %d", i);
}
getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, i, correction, queuePool,
hasAutoCorrectionCandidate);
}
@ -237,26 +236,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
PROF_END(5);
PROF_START(6);
if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
// The first and last "mistyped spaces" are taken care of by excessive character handling
for (int i = 1; i < inputLength - 1; ++i) {
if (DEBUG_DICT) {
AKLOGI("--- Suggest words with proximity space %d", i);
}
const int x = xcoordinates[i];
const int y = ycoordinates[i];
if (DEBUG_PROXIMITY_INFO) {
AKLOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
i, x, y, proximityInfo->hasSpaceProximity(x, y));
}
if (proximityInfo->hasSpaceProximity(x, y)) {
getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, i, correction, queuePool,
hasAutoCorrectionCandidate);
}
}
}
// Note: This line is intentionally left blank
PROF_END(6);
if (DEBUG_DICT) {
queuePool->dumpSubQueue1TopSuggestions();
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
@ -337,24 +319,6 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
}
}
void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int missingSpacePos, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */,
correction, queuePool, hasAutoCorrectionCandidate);
}
void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int spaceProximityPos, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos,
correction, queuePool, hasAutoCorrectionCandidate);
}
inline void UnigramDictionary::onTerminal(const int freq,
const TerminalAttributes& terminalAttributes, Correction *correction,
WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
@ -405,15 +369,23 @@ inline void UnigramDictionary::onTerminal(const int freq,
}
}
int UnigramDictionary::getSubStringSuggestion(
bool UnigramDictionary::getSubStringSuggestion(
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
const int *codes, const bool useFullEditDistance, Correction *correction,
WordsPriorityQueuePool* queuePool, const int inputLength,
const bool hasAutoCorrectionCandidate, const int currentWordIndex,
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength) {
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) {
if (DEBUG_DICT) {
assert(currentWordIndex >= 1);
}
unsigned short* tempOutputWord = 0;
int tempOutputWordLength = 0;
// TODO: Optimize init suggestion
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
inputLength, correction);
int freq = getMostFrequentWordLike(
inputWordStartPos, inputWordLength, proximityInfo, mWord);
if (freq > 0) {
@ -438,7 +410,7 @@ int UnigramDictionary::getSubStringSuggestion(
}
WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
if (!queue || queue->size() < 1) {
return 0;
return false;
}
int score = 0;
const double ns = queue->getHighestNormalizedScore(
@ -451,91 +423,103 @@ int UnigramDictionary::getSubStringSuggestion(
// threshold.
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|| tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
return 0;
return false;
}
freq = score >> (tempOutputWordLength
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
}
if (DEBUG_DICT) {
AKLOGI("Freq(%d): %d", currentWordIndex, freq);
AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d"
, currentWordIndex, freq, tempOutputWordLength, inputWordLength, inputWordStartPos);
}
if (freq <= 0 || tempOutputWordLength <= 0
|| MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) {
return 0;
return false;
}
for (int i = 0; i < tempOutputWordLength; ++i) {
outputWord[outputWordStartPos + i] = tempOutputWord[i];
}
// Put output values
freqArray[currentWordIndex - 1] = freq;
// TODO: put output length instead of input length
wordLengthArray[currentWordIndex - 1] = inputWordLength;
*outputWordLength = outputWordStartPos + tempOutputWordLength;
if ((inputWordStartPos + inputWordLength) < inputLength) {
if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) {
return 0;
return false;
}
outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
++tempOutputWordLength;
++*outputWordLength;
} else if (currentWordIndex >= 2) {
// TODO: Handle 3 or more words
const int pairFreq = correction->getFreqForSplitTwoWords(
freqArray, wordLengthArray, isSpaceProximity, outputWord);
if (DEBUG_DICT) {
AKLOGI("Split two words: %d, %d, %d, %d", freqArray[0], freqArray[1], pairFreq,
inputLength);
}
addWord(outputWord, *outputWordLength, pairFreq, queuePool->getMasterQueue());
}
*outputWordLength = outputWordStartPos + tempOutputWordLength;
return freq;
return true;
}
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool,
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate) {
if (inputLength >= MAX_WORD_LENGTH) return;
if (DEBUG_DICT) {
int inputCount = 0;
if (spaceProximityPos >= 0) ++inputCount;
if (missingSpacePos >= 0) ++inputCount;
assert(inputCount <= 1);
// MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16
assert(MAX_PROXIMITY_CHARS == 16);
}
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
inputLength, correction);
// Allocating fixed length array on stack
unsigned short outputWord[MAX_WORD_LENGTH];
int freqArray[SUB_QUEUE_MAX_WORD_INDEX];
int wordLengthArray[SUB_QUEUE_MAX_WORD_INDEX];
int outputWordLength = 0;
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
const bool isSpaceProximity = spaceProximityPos >= 0;
// First word
int inputWordStartPos = 0;
int inputWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
const int firstFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
int inputWordLength = wordDivideIndex;
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, outputWord, &outputWordLength);
if (firstFreq <= 0) {
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
freqArray, wordLengthArray, outputWord, &outputWordLength)) {
return;
}
const int tempOutputWordLength = outputWordLength;
// Second word
inputWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
inputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
: (inputLength - missingSpacePos);
const int secondFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
// Missing space
inputWordStartPos = wordDivideIndex;
inputWordLength = inputLength - wordDivideIndex;
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, outputWordLength, outputWord,
&outputWordLength);
if (secondFreq <= 0) {
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
false /* missing space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
// Mistyped space
++inputWordStartPos;
--inputWordLength;
if (inputWordLength <= 0) {
return;
}
// TODO: Remove initSuggestions and correction->setCorrectionParams
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, spaceProximityPos, missingSpacePos,
useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, outputWord);
if (DEBUG_DICT) {
AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
const int x = xcoordinates[inputWordStartPos - 1];
const int y = ycoordinates[inputWordStartPos - 1];
if (!proximityInfo->hasSpaceProximity(x, y)) {
return;
}
addWord(outputWord, outputWordLength, pairFreq, masterQueue);
return;
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
true /* mistyped space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
}
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

View File

@ -103,17 +103,9 @@ class UnigramDictionary {
const int currentWordIndex);
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool,
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate);
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int missingSpacePos, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int spaceProximityPos, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
const int currentWordIndex);
@ -127,13 +119,14 @@ class UnigramDictionary {
ProximityInfo *proximityInfo, unsigned short *word);
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
short unsigned int *outWord);
int getSubStringSuggestion(
bool getSubStringSuggestion(
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
const int *codes, const bool useFullEditDistance, Correction *correction,
WordsPriorityQueuePool* queuePool, const int inputLength,
const bool hasAutoCorrectionCandidate, const int currentWordIndex,
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength);
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;