Promote full matches with differing accents.

Stop considering accented characters as different from their base
character for proximity scoring.
Also give a huge boost (basically overriding frequency) to a word
fully matched with only differing accents.

Bug: 2550587

Change-Id: I2da7a71229fb3868d9e4a53703ccf8caeb6fcf10
This commit is contained in:
Jean Chalard 2011-01-27 14:20:22 +09:00
parent 588d2a525c
commit 8dc754a411
3 changed files with 39 additions and 21 deletions

View File

@ -129,6 +129,7 @@ static void prof_out(void) {
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
@ -136,6 +137,9 @@ static void prof_out(void) {
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
// This is used as a bare multiplier (not subject to /100): the final frequency is multiplied
// by it directly when a fully matched word has the same length as the input and no skipped,
// excessive or transposed character position is set, i.e. when only accents or
// capitalization may differ from what was typed.
#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java.
// It is only used for sizing arrays and is not to be used in C functions.
#define MAX_WORD_LENGTH_INTERNAL 48

View File

@ -363,9 +363,14 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
}
int lengthFreq = TYPED_LETTER_MULTIPLIER;
for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;
if (depth > 1 && lengthFreq == snr) {
if (DEBUG_DICT) LOGI("Found full matched word.");
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
if (lengthFreq == snr) {
if (depth > 1) {
if (DEBUG_DICT) LOGI("Found full matched word.");
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
}
}
if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
return finalFreq;
@ -385,10 +390,9 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLe
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
unsigned short *word, const int inputIndex, const int depth, const int snr,
const int skipPos, const int excessivePos, const int transposedPos, const int freq,
const int addedWeight) {
const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
if (sameAsTyped(word, depth + 1)) return;
const int finalFreq = calculateFinalFreq(inputIndex, depth, snr * addedWeight, skipPos,
const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos,
excessivePos, transposedPos, freq, true);
// Proximity collection will promote a word of the same length as what user typed.
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
@ -424,9 +428,9 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
return false;
}
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
const unsigned short c, const int skipPos, const int excessivePos,
const int transposedPos) {
inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos) {
const unsigned short lowerC = toLowerCase(c);
int j = 0;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
@ -434,18 +438,19 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
// If skipPos is defined, not to search proximity collections.
// First char is what user typed.
if (matched) {
return j;
if (j > 0) return NEAR_PROXIMITY_CHAR;
return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
} else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
// Not to check proximity characters
return -1;
return UNRELATED_CHAR;
}
++j;
}
return -1;
return UNRELATED_CHAR;
}
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
const int maxDepth, const bool traverseAllNodes, int snr, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
@ -492,22 +497,24 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
transposedPos);
if (matchedProximityCharId < 0) return false;
if (UNRELATED_CHAR == matchedProximityCharId) return false;
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
snr = snr * TYPED_LETTER_MULTIPLIER;
}
bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
if (isSameAsUserTypedLength && terminal) {
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,
skipPos, excessivePos, transposedPos, freq, addedWeight);
skipPos, excessivePos, transposedPos, freq);
}
if (!needsToTraverseChildrenNodes) return false;
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
*newSnr = snr * addedWeight;
*newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
*newSnr = snr;
*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
// Optimization: Prune out words that are too long compared to how much was typed.

View File

@ -22,6 +22,13 @@
namespace latinime {
class UnigramDictionary {
// Result of comparing a dictionary character against a typed character.
// Kept as an unscoped enum so the values still convert implicitly to int.
enum ProximityType {
    // Same char, possibly with different case or accent
    SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR = 0,
    // It is a char located nearby on the keyboard
    NEAR_PROXIMITY_CHAR = 1,
    // It is an unrelated char
    UNRELATED_CHAR = 2
};
public:
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
@ -60,11 +67,11 @@ private:
const int transposedPos, const int freq);
void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word,
const int inputIndex, const int depth, const int snr, const int skipPos,
const int excessivePos, const int transposedPos, const int freq, const int addedWeight);
const int excessivePos, const int transposedPos, const int freq);
bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth);
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos);
ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
const int skipPos, const int excessivePos, const int transposedPos);
// Process a node by considering proximity, missing and excessive character
bool processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,