From 1f6b52e76c59700984fe2b7d7b436d81da997e93 Mon Sep 17 00:00:00 2001
From: satok <satok@google.com>
Date: Mon, 30 Jan 2012 13:53:58 +0900
Subject: [PATCH] Implement multi words suggestions step1

Change-Id: I96e8e1b0d9ccc0ed13d53c40300d8c19bcb7af5b
---
 native/src/correction.cpp              |  11 +-
 native/src/defines.h                   |   6 +-
 native/src/unigram_dictionary.cpp      | 177 ++++++++++++++-----------
 native/src/unigram_dictionary.h        |   9 +-
 native/src/words_priority_queue_pool.h |  39 ++----
 5 files changed, 132 insertions(+), 110 deletions(-)

diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index ee5023532..7323747d7 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -827,11 +827,6 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
     const bool capitalizedWordDemotion =
             firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
 
-    if (DEBUG_DICT_FULL) {
-        AKLOGI("Two words: %c, %c, %d",
-                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
-    }
-
     if (firstWordLength == 0 || secondWordLength == 0) {
         return 0;
     }
@@ -891,6 +886,12 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
         multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
     }
 
+    if (DEBUG_CORRECTION_FREQ) {
+        AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
+                secondWordLength, capitalizedWordDemotion, totalFreq);
+        DUMP_WORD(word, firstWordLength);
+    }
+
     return totalFreq;
 }
 
diff --git a/native/src/defines.h b/native/src/defines.h
index c25f963e0..3f3f5ba5c 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -216,15 +216,15 @@ static void prof_out(void) {
 #define SUB_QUEUE_MAX_WORDS 1
 #define SUB_QUEUE_MAX_COUNT 10
 #define SUB_QUEUE_MIN_WORD_LENGTH 4
-#define SUB_QUEUE_MAX_WORD_INDEX 2
+#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
 
 #define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
 #define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
 
 #define MAX_DEPTH_MULTIPLIER 3
 
-#define FIRST_WORD_INDEX 1
-#define SECOND_WORD_INDEX 2
+#define FIRST_WORD_INDEX 0
+#define SECOND_WORD_INDEX 1
 
 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
 // word in the dictionary
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 8b1a25d90..597e5c821 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -224,14 +224,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
     // Multiple word suggestions
     if (SUGGEST_MULTIPLE_WORDS
             && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
-        for (int i = 1; i < inputLength; ++i) {
-            if (DEBUG_DICT) {
-                AKLOGI("--- Suggest multiple words %d", i);
-            }
-            getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
-                    useFullEditDistance, inputLength, i, correction, queuePool,
-                    hasAutoCorrectionCandidate);
-        }
+        getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+                useFullEditDistance, inputLength, correction, queuePool,
+                hasAutoCorrectionCandidate);
     }
     PROF_END(5);
 
@@ -329,7 +324,7 @@ inline void UnigramDictionary::onTerminal(const int freq,
     int wordLength;
     unsigned short* wordPointer;
 
-    if ((currentWordIndex == 1) && addToMasterQueue) {
+    if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) {
         WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
         const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
         if (finalFreq != NOT_A_FREQUENCY) {
@@ -377,11 +372,8 @@ bool UnigramDictionary::getSubStringSuggestion(
         const int inputWordStartPos, const int inputWordLength,
         const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
         int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) {
-    if (DEBUG_DICT) {
-        assert(currentWordIndex >= 1);
-    }
     unsigned short* tempOutputWord = 0;
-    int tempOutputWordLength = 0;
+    int nextWordLength = 0;
     // TODO: Optimize init suggestion
     initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
             inputLength, correction);
@@ -389,7 +381,7 @@ bool UnigramDictionary::getSubStringSuggestion(
     int freq = getMostFrequentWordLike(
             inputWordStartPos, inputWordLength, proximityInfo, mWord);
     if (freq > 0) {
-        tempOutputWordLength = inputWordLength;
+        nextWordLength = inputWordLength;
         tempOutputWord = mWord;
     } else if (!hasAutoCorrectionCandidate) {
         if (inputWordStartPos > 0) {
@@ -400,7 +392,7 @@ bool UnigramDictionary::getSubStringSuggestion(
             getSuggestionCandidates(useFullEditDistance, inputWordLength, correction,
                     queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
             if (DEBUG_DICT) {
-                if (currentWordIndex <= SUB_QUEUE_MAX_WORD_INDEX) {
+                if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
                     AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
                     for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
                         queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
@@ -415,59 +407,122 @@ bool UnigramDictionary::getSubStringSuggestion(
         int score = 0;
         const double ns = queue->getHighestNormalizedScore(
                 proximityInfo->getPrimaryInputWord(), inputWordLength,
-                &tempOutputWord, &score, &tempOutputWordLength);
+                &tempOutputWord, &score, &nextWordLength);
         if (DEBUG_DICT) {
             AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
         }
         // Two words correction won't be done if the score of the first word doesn't exceed the
         // threshold.
         if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
-                || tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
+                || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
             return false;
         }
-        freq = score >> (tempOutputWordLength
-                + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
+        freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
     }
     if (DEBUG_DICT) {
-        AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d"
-                , currentWordIndex, freq, tempOutputWordLength, inputWordLength, inputWordStartPos);
+        AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)"
+                , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos,
+                wordLengthArray[0]);
     }
-    if (freq <= 0 || tempOutputWordLength <= 0
-            || MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) {
+    if (freq <= 0 || nextWordLength <= 0
+            || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) {
         return false;
     }
-    for (int i = 0; i < tempOutputWordLength; ++i) {
+    for (int i = 0; i < nextWordLength; ++i) {
         outputWord[outputWordStartPos + i] = tempOutputWord[i];
     }
 
     // Put output values
-    freqArray[currentWordIndex - 1] = freq;
+    freqArray[currentWordIndex] = freq;
     // TODO: put output length instead of input length
-    wordLengthArray[currentWordIndex - 1] = inputWordLength;
-    *outputWordLength = outputWordStartPos + tempOutputWordLength;
+    wordLengthArray[currentWordIndex] = inputWordLength;
+    const int tempOutputWordLength = outputWordStartPos + nextWordLength;
+    if (outputWordLength) {
+        *outputWordLength = tempOutputWordLength;
+    }
 
     if ((inputWordStartPos + inputWordLength) < inputLength) {
-        if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) {
+        if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
             return false;
         }
         outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
-        ++*outputWordLength;
-    } else if (currentWordIndex >= 2) {
+        if (outputWordLength) {
+            ++*outputWordLength;
+        }
+    } else if (currentWordIndex >= 1) {
         // TODO: Handle 3 or more words
         const int pairFreq = correction->getFreqForSplitTwoWords(
                 freqArray, wordLengthArray, isSpaceProximity, outputWord);
         if (DEBUG_DICT) {
-            AKLOGI("Split two words: %d, %d, %d, %d", freqArray[0], freqArray[1], pairFreq,
-                    inputLength);
+            AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
+                    inputLength, wordLengthArray[0]);
         }
-        addWord(outputWord, *outputWordLength, pairFreq, queuePool->getMasterQueue());
+        addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
     }
     return true;
 }
 
+void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
+        const int *xcoordinates, const int *ycoordinates, const int *codes,
+        const bool useFullEditDistance, const int inputLength,
+        Correction *correction, WordsPriorityQueuePool* queuePool,
+        const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex,
+        const int outputWordLength, int *freqArray, int* wordLengthArray,
+        unsigned short* outputWord) {  // startInputPos and outputWordLength are not read here
+    if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) {
+        // Already at the last allowed word index: no further word can be split off.
+        return;
+    }
+    for (int i = 1; i < inputLength; ++i) {
+        int tempOutputWordLength = 0;
+        // First word = input[0, i); on success tempOutputWordLength = output length so far.
+        int inputWordStartPos = 0;
+        int inputWordLength = i;
+        if (DEBUG_CORRECTION_FREQ) {
+            AKLOGI("Two words, %d", inputWordLength);
+        }
+        if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+                FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
+                freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
+            continue;
+        }
+        // Both second-word attempts below append to outputWord at tempOutputWordLength.
+        // Second word
+        // Missing space: the second word is the whole remainder, input[i, inputLength).
+        inputWordStartPos = i;
+        inputWordLength = inputLength - i;
+        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
+                false /* missing space */, freqArray, wordLengthArray, outputWord,
+                0);  // null outputWordLength: the combined length is not needed here
+        // The return value is ignored; the mistyped-space variant is tried regardless.
+        // Mistyped space: drop the key at index i and use input[i + 1, inputLength) instead.
+        ++inputWordStartPos;
+        --inputWordLength;
+
+        if (inputWordLength <= 0) {
+            continue;
+        }
+        // Only proceed if hasSpaceProximity() accepts the touch point of the dropped key.
+        const int x = xcoordinates[inputWordStartPos - 1];
+        const int y = ycoordinates[inputWordStartPos - 1];
+        if (!proximityInfo->hasSpaceProximity(x, y)) {
+            continue;
+        }
+
+        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
+                true /* mistyped space */, freqArray, wordLengthArray, outputWord,
+                0);
+    }
+}
+
 void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
         const int *xcoordinates, const int *ycoordinates, const int *codes,
-        const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
+        const bool useFullEditDistance, const int inputLength,
         Correction *correction, WordsPriorityQueuePool* queuePool,
         const bool hasAutoCorrectionCandidate) {
     if (inputLength >= MAX_WORD_LENGTH) return;
@@ -475,51 +530,21 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
         // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16
         assert(MAX_PROXIMITY_CHARS == 16);
     }
+    if (DEBUG_DICT) {
+        AKLOGI("--- Suggest multiple words");
+    }
 
     // Allocating fixed length array on stack
     unsigned short outputWord[MAX_WORD_LENGTH];
-    int freqArray[SUB_QUEUE_MAX_WORD_INDEX];
-    int wordLengthArray[SUB_QUEUE_MAX_WORD_INDEX];
-    int outputWordLength = 0;
-
-    // First word
-    int inputWordStartPos = 0;
-    int inputWordLength = wordDivideIndex;
-    if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
-            useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-            FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
-            freqArray, wordLengthArray, outputWord, &outputWordLength)) {
-        return;
-    }
-
-    const int tempOutputWordLength = outputWordLength;
-    // Second word
-    // Missing space
-    inputWordStartPos = wordDivideIndex;
-    inputWordLength = inputLength - wordDivideIndex;
-    getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
-            useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-            SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
-            false /* missing space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
-
-    // Mistyped space
-    ++inputWordStartPos;
-    --inputWordLength;
-
-    if (inputWordLength <= 0) {
-        return;
-    }
-
-    const int x = xcoordinates[inputWordStartPos - 1];
-    const int y = ycoordinates[inputWordStartPos - 1];
-    if (!proximityInfo->hasSpaceProximity(x, y)) {
-        return;
-    }
-
-    getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
-            useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-            SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
-            true /* mistyped space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
+    int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS] = {};  // zeroed: debug logs read early
+    int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS] = {};  // [0] is logged before set
+    const int outputWordLength = 0;
+    const int startInputPos = 0;
+    const int startWordIndex = 0;
+    getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
+            useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate,
+            startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray,
+            outputWord);
 }
 
 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 79793d676..2d5d076b1 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -103,7 +103,7 @@ class UnigramDictionary {
             const int currentWordIndex);
     void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
             const int *xcoordinates, const int *ycoordinates, const int *codes,
-            const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
+            const bool useFullEditDistance, const int inputLength,
             Correction *correction, WordsPriorityQueuePool* queuePool,
             const bool hasAutoCorrectionCandidate);
     void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
@@ -127,6 +127,13 @@ class UnigramDictionary {
             const int inputWordStartPos, const int inputWordLength,
             const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
             int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
+    // Tries each two-word split of the input; candidates go to queuePool's master queue.
+    void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
+            const int *xcoordinates, const int *ycoordinates, const int *codes,
+            const bool useFullEditDistance, const int inputLength, Correction *correction,
+            WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate,
+            const int startInputPos, const int startWordIndex, const int outputWordLength,
+            int *freqArray, int* wordLengthArray, unsigned short* outputWord);
 
     const uint8_t* const DICT_ROOT;
     const int MAX_WORD_LENGTH;
diff --git a/native/src/words_priority_queue_pool.h b/native/src/words_priority_queue_pool.h
index a4aa8b6ca..5b50e8f4f 100644
--- a/native/src/words_priority_queue_pool.h
+++ b/native/src/words_priority_queue_pool.h
@@ -27,11 +27,10 @@ class WordsPriorityQueuePool {
  public:
     WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) {
         mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength);
-        for (int i = 0, subQueueBufOffset = 0; i < SUB_QUEUE_MAX_COUNT;
+        for (int i = 0, subQueueBufOffset = 0;
+                i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT;
                 ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) {
-            mSubQueues1[i] = new(mSubQueueBuf1 + subQueueBufOffset)
-                    WordsPriorityQueue(subQueueMaxWords, maxWordLength);
-            mSubQueues2[i] = new(mSubQueueBuf2 + subQueueBufOffset)
+            mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset)
                     WordsPriorityQueue(subQueueMaxWords, maxWordLength);
         }
     }
@@ -44,7 +43,7 @@ class WordsPriorityQueuePool {
     }
 
     WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) {
-        if (wordIndex > SUB_QUEUE_MAX_WORD_INDEX) {
+        if (wordIndex < 0 || wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {  // both bounds
             return 0;
         }
         if (inputWordLength < 0 || inputWordLength >= SUB_QUEUE_MAX_COUNT) {
@@ -53,30 +52,21 @@ class WordsPriorityQueuePool {
             }
             return 0;
         }
-        // TODO: Come up with more generic pool
-        if (wordIndex == 1) {
-            return mSubQueues1[inputWordLength];
-        } else if (wordIndex == 2) {
-            return mSubQueues2[inputWordLength];
-        } else {
-            return 0;
-        }
+        return mSubQueues[wordIndex * SUB_QUEUE_MAX_COUNT + inputWordLength];
     }
 
     inline void clearAll() {
         mMasterQueue->clear();
-        for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
-            mSubQueues1[i]->clear();
-            mSubQueues2[i]->clear();
+        for (int i = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS; ++i) {  // i = word index
+            clearSubQueue(i);  // clears every sub queue of that word index
         }
     }
 
     inline void clearSubQueue(const int wordIndex) {
         for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
-            if (wordIndex == 1) {
-                mSubQueues1[i]->clear();
-            } else if (wordIndex == 2) {
-                mSubQueues2[i]->clear();
+            WordsPriorityQueue* queue = getSubQueue(wordIndex, i);  // 0 on a bad wordIndex
+            if (queue) {
+                queue->clear();  // i walks all SUB_QUEUE_MAX_COUNT slots of this word index
             }
         }
     }
@@ -84,17 +74,16 @@ class WordsPriorityQueuePool {
     void dumpSubQueue1TopSuggestions() {
         AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS");
         for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
-            mSubQueues1[i]->dumpTopWord();
+            getSubQueue(FIRST_WORD_INDEX, i)->dumpTopWord();  // first word's sub queues
         }
     }
 
  private:
     WordsPriorityQueue* mMasterQueue;
-    WordsPriorityQueue* mSubQueues1[SUB_QUEUE_MAX_COUNT];
-    WordsPriorityQueue* mSubQueues2[SUB_QUEUE_MAX_COUNT];
+    WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
     char mMasterQueueBuf[sizeof(WordsPriorityQueue)];
-    char mSubQueueBuf1[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
-    char mSubQueueBuf2[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
+    char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS
+                      * SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
 };
 }