From bcac0e9e23853891a5a45fd19b6f8917ffc705f7 Mon Sep 17 00:00:00 2001
From: satok <satok@google.com>
Date: Mon, 15 Aug 2011 22:30:33 +0900
Subject: [PATCH] Improve fat finger correction.

Change-Id: I37ed0dc6956f7e6fab5dcfd0483ab5691cd819d4
---
 native/src/correction.cpp | 202 +++++++++++++++++++-------------------
 native/src/correction.h   |   3 +-
 native/src/defines.h      |   3 +
 3 files changed, 103 insertions(+), 105 deletions(-)

diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index a4090a966..99412b211 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
     }
 
     *word = mWord;
-    const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
-            : (mInputLength == inputIndex + 1);
     return Correction::RankingAlgorithm::calculateFinalFreq(
-            inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this);
+            inputIndex, outputIndex, freq, mEditDistanceTable, this);
 }
 
 bool Correction::initProcessState(const int outputIndex) {
@@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
     }
 
     if (mNeedsToTraverseAllNodes || isQuote(c)) {
-        const bool checkProximityChars =
-                !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
-        // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
-        // proximity chars of "s", but it should rather be handled as a skipped char.
-        if (checkProximityChars
-                && mInputIndex > 0
-                && mCorrectionStates[mOutputIndex].mProximityMatching
-                && mCorrectionStates[mOutputIndex].mSkipping
-                && mProximityInfo->getMatchedProximityId(
-                        mInputIndex - 1, c, false)
-                        == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
-            ++mSkippedCount;
-            --mProximityCount;
-        }
         return processSkipChar(c, isTerminal);
     } else {
         int inputIndexForProximity = mInputIndex;
@@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
                     && mProximityInfo->getMatchedProximityId(
                             inputIndexForProximity - 1, c, false)
                                     == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
+                // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
+                // proximity chars of "s", but it should rather be handled as a skipped char.
                 ++mSkippedCount;
                 --mProximityCount;
                 return processSkipChar(c, isTerminal);
@@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) {
     }
 }
 
+inline static int getQuoteCount(const unsigned short* word, const int length) {
+    int quoteCount = 0;
+    for (int i = 0; i < length; ++i) {
+        if(word[i] == '\'') {
+            ++quoteCount;
+        }
+    }
+    return quoteCount;
+}
+
 /* static */
 inline static int editDistance(
         int* editDistanceTable, const unsigned short* input,
@@ -392,8 +388,7 @@ inline static int editDistance(
 
 /* static */
 int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
-        const int freq, const bool sameLength, int* editDistanceTable,
-        const Correction* correction) {
+        const int freq, int* editDistanceTable, const Correction* correction) {
     const int excessivePos = correction->getExcessivePos();
     const int transposedPos = correction->getTransposedPos();
     const int inputLength = correction->mInputLength;
@@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
     const ProximityInfo *proximityInfo = correction->mProximityInfo;
     const int skipCount = correction->mSkippedCount;
     const int proximityMatchedCount = correction->mProximityCount;
+    if (skipCount >= inputLength || inputLength == 0) {
+        return -1;
+    }
+    const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
+            : (inputLength == inputIndex + 1);
+
 
     // TODO: use mExcessiveCount
     int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
@@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
     const unsigned short* word = correction->mWord;
     const bool skipped = skipCount > 0;
 
-    // ----- TODO: use edit distance here as follows? ---------------------- /
-    //if (!skipped && excessivePos < 0 && transposedPos < 0) {
-    //    const int ed = editDistance(dp, proximityInfo->getInputWord(),
-    //            inputLength, word, outputIndex + 1);
-    //    matchCount = outputIndex + 1 - ed;
-    //    if (ed == 1 && !sameLength) ++matchCount;
-    //}
-    //    const int ed = editDistance(dp, proximityInfo->getInputWord(),
-    //    inputLength, word, outputIndex + 1);
-    //    if (ed == 1 && !sameLength) ++matchCount; ------------------------ /
-    int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
+    const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
+            - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
+
+    // TODO: Calculate edit distance for transposed and excessive
+    int matchWeight;
+    int ed = 0;
+    int adJustedProximityMatchedCount = proximityMatchedCount;
+    if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
+        const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
+        ed = editDistance(editDistanceTable, primaryInputWord,
+                inputLength, word, outputIndex + 1);
+        matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed);
+        if (ed == 1 && inputLength == outputIndex) {
+            // Promote a word with just one skipped char
+            multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight);
+        }
+        ed = max(0, ed - quoteDiffCount);
+        adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
+                proximityMatchedCount);
+    } else {
+        matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
+    }
 
     // TODO: Demote by edit distance
     int finalFreq = freq * matchWeight;
-    // +1 +11/-12
-    /*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
-        const int ed = editDistance(dp, proximityInfo->getInputWord(),
-                inputLength, word, outputIndex + 1);
-        if (ed == 1) {
-            multiplyRate(160, &finalFreq);
-        }
-    }*/
-    if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
-            && (proximityMatchedCount > 0 || skipped)) {
-        const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
-                inputLength, word, outputIndex + 1);
-        if (ed == 1) {
-            multiplyRate(160, &finalFreq);
-        }
-    }
 
-    // TODO: Promote properly?
-    //if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
-    //        && !sameLength) {
-    //    multiplyRate(150, &finalFreq);
-    //}
-    //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
-    //        && !sameLength) {
-    //    multiplyRate(150, &finalFreq);
-    //}
-    //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
-    //        && inputLength == outputIndex + 1) {
-    //    multiplyRate(150, &finalFreq);
-    //}
+    ///////////////////////////////////////////////
+    // Promotion and Demotion for each correction
 
+    // Demotion for a word with missing character
     if (skipped) {
-        if (inputLength >= 2) {
-            const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
-                    * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
-                    / (10 * inputLength
-                            - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
-            if (DEBUG_DICT_FULL) {
-                LOGI("Demotion rate for missing character is %d.", demotionRate);
-            }
-            multiplyRate(demotionRate, &finalFreq);
-        } else {
-            finalFreq = 0;
+        const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
+                * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
+                / (10 * inputLength
+                        - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
+        if (DEBUG_DICT_FULL) {
+            LOGI("Demotion rate for missing character is %d.", demotionRate);
         }
+        multiplyRate(demotionRate, &finalFreq);
     }
+
+    // Demotion for a word with transposed character
     if (transposedPos >= 0) multiplyRate(
             WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
+
+    // Demotion for a word with excessive character
     if (excessivePos >= 0) {
         multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
         if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
@@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
             multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
         }
     }
-    int lengthFreq = typedLetterMultiplier;
-    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq);
-    if ((outputIndex + 1) == matchCount) {
-        // Full exact match
-        if (outputIndex > 1) {
-            if (DEBUG_DICT) {
-                LOGI("Found full matched word.");
-            }
-            multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
-        }
-        if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
-            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
-        }
-    } else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
-            && outputIndex > 0) {
+
+    // Promotion for a word with proximity characters
+    for (int i = 0; i < adJustedProximityMatchedCount; ++i) {
         // A word with proximity corrections
-        if (DEBUG_DICT) {
-            LOGI("Found one proximity correction.");
+        if (DEBUG_DICT_FULL) {
+            LOGI("Found a proximity correction.");
         }
         multiplyIntCapped(typedLetterMultiplier, &finalFreq);
         multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
     }
-    if (DEBUG_DICT_FULL) {
-        LOGI("calc: %d, %d", outputIndex, sameLength);
-    }
-    if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);
 
-    // TODO: check excessive count and transposed count
+    const int errorCount = proximityMatchedCount + skipCount;
+    multiplyRate(
+            100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
+
+    // Promotion for an exactly matched word
+    if (matchCount == outputIndex + 1) {
+        // Full exact match
+        if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
+            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
+        }
+    }
+
+    // Promote a word with no correction
+    if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) {
+        multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
+    }
+
+    // TODO: Check excessive count and transposed count
+    // TODO: Remove this if possible
     /*
-     If the last character of the user input word is the same as the next character
-     of the output word, and also all of characters of the user input are matched
-     to the output word, we'll promote that word a bit because
-     that word can be considered the combination of skipped and matched characters.
-     This means that the 'sm' pattern wins over the 'ma' pattern.
-     e.g.)
-     shel -> shell [mmmma] or [mmmsm]
-     hel -> hello [mmmaa] or [mmsma]
-     m ... matching
-     s ... skipping
-     a ... traversing all
+         If the last character of the user input word is the same as the next character
+         of the output word, and also all of characters of the user input are matched
+         to the output word, we'll promote that word a bit because
+         that word can be considered the combination of skipped and matched characters.
+         This means that the 'sm' pattern wins over the 'ma' pattern.
+         e.g.)
+         shel -> shell [mmmma] or [mmmsm]
+         hel -> hello [mmmaa] or [mmsma]
+         m ... matching
+         s ... skipping
+         a ... traversing all
      */
     if (matchCount == inputLength && matchCount >= 2 && !skipped
             && word[matchCount] == word[matchCount - 1]) {
         multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
     }
 
+    if (sameLength) {
+        multiplyIntCapped(fullWordMultiplier, &finalFreq);
+    }
+
+    if (DEBUG_DICT_FULL) {
+        LOGI("calc: %d, %d", outputIndex, sameLength);
+    }
+
     return finalFreq;
 }
 
diff --git a/native/src/correction.h b/native/src/correction.h
index 9d385a44e..871a04251 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -139,8 +139,7 @@ private:
     class RankingAlgorithm {
     public:
         static int calculateFinalFreq(const int inputIndex, const int depth,
-                const int freq, const bool sameLength, int *editDistanceTable,
-                const Correction* correction);
+                const int freq, int *editDistanceTable, const Correction* correction);
         static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                 const Correction* correction);
     };
diff --git a/native/src/defines.h b/native/src/defines.h
index c1d08e695..a29fb7e5b 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) {
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120
 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
+#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
+#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42
 
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
@@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) {
 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
 
 #define min(a,b) ((a)<(b)?(a):(b))
+#define max(a,b) ((a)>(b)?(a):(b))
 
 #endif // LATINIME_DEFINES_H