Read shortcuts as strings in the dictionary.

This has no impact on performance.
Before:
(0)  9.61 (0.01%)
(1)  57514.58 (56.70%)
(2)  10.55 (0.01%)
(3)  10.79 (0.01%)
(4)  133.20 (0.13%)
(5)  43553.87 (42.94%)
(6)  10.03 (0.01%)
(20) 47.20 (0.05%)
Total 101431.47 (sum of others 101289.84)

After:
(0)  10.52 (0.01%)
(1)  56311.16 (56.66%)
(2)  13.40 (0.01%)
(3)  10.98 (0.01%)
(4)  136.72 (0.14%)
(5)  42707.92 (42.97%)
(6)  9.79 (0.01%)
(20) 51.35 (0.05%)
Total 99390.76 (sum of others 99251.84)

The difference is not significant with regard to measure imprecision

Change-Id: I2e4f1ef7a5e99082e67dd27f56cf4fc432bb48fa
This commit is contained in:
Jean Chalard 2012-03-27 19:56:23 +09:00
parent 7540fd009d
commit 9a933a742d
5 changed files with 50 additions and 27 deletions

View File

@ -123,6 +123,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
}
pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipFrequency(flags, pos);
pos = BinaryFormat::skipShortcuts(root, flags, pos);
int bigramFlags;
int bigramCount = 0;
do {

View File

@ -40,6 +40,9 @@ class BinaryFormat {
// implementations. On this occasion, we made the magic number 32 bits long.
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
const static int SHORTCUT_LIST_SIZE_SIZE = 2;
static int detectFormat(const uint8_t* const dict);
static unsigned int getHeaderSize(const uint8_t* const dict);
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
@ -47,9 +50,10 @@ class BinaryFormat {
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
static int skipAttributes(const uint8_t* const dict, const int pos);
static int skipChildrenPosition(const uint8_t flags, const int pos);
static int skipFrequency(const uint8_t flags, const int pos);
static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos);
@ -157,12 +161,12 @@ static inline int attributeAddressSize(const uint8_t flags) {
*/
}
inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
int currentPos = pos;
uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
flags = getFlagsAndForwardPointer(dict, &currentPos);
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
}
currentPos += attributeAddressSize(flags);
return currentPos;
@ -174,6 +178,10 @@ static inline int childrenAddressSize(const uint8_t flags) {
/* See the note in attributeAddressSize. The same applies here */
}
static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
}
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
return pos + childrenAddressSize(flags);
}
@ -182,16 +190,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
const int pos) {
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos);
} else {
return pos;
}
}
inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
const int pos) {
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos);
} else {
return pos;
}
}
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos) {
// This function skips all attributes: shortcuts and bigrams.
int newPos = pos;
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
newPos = skipAttributes(dict, newPos);
}
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
newPos = skipAttributes(dict, newPos);
}
newPos = skipShortcuts(dict, flags, newPos);
newPos = skipBigrams(dict, flags, newPos);
return newPos;
}

View File

@ -45,13 +45,19 @@ class TerminalAttributes {
// Gets the shortcut target itself as a uint16_t string. For parameters and return value
// see BinaryFormat::getWordAtAddress.
// TODO: make the output an uint32_t* to handle the whole unicode range.
inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
mHasNextShortcutTarget =
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
int shortcutAddress =
BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
unsigned int i;
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
if (NOT_A_CHARACTER == charCode) break;
outWord[i] = (uint16_t)charCode;
}
mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
return i;
}
};
@ -65,12 +71,10 @@ class TerminalAttributes {
mDict(dict), mFlags(flags), mStartPos(pos) {
}
inline bool isShortcutOnly() const {
return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
}
inline ShortcutIterator getShortcutIterator() const {
return ShortcutIterator(mDict, mStartPos, mFlags);
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
// skipped quickly, so we ignore it.
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
}
};
} // namespace latinime

View File

@ -366,10 +366,9 @@ inline void UnigramDictionary::onTerminal(const int freq,
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
if (finalFreq != NOT_A_FREQUENCY) {
if (!terminalAttributes.isShortcutOnly()) {
addWord(wordPointer, wordLength, finalFreq, masterQueue);
}
addWord(wordPointer, wordLength, finalFreq, masterQueue);
const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
// Please note that the shortcut candidates will be added to the master queue only.
TerminalAttributes::ShortcutIterator iterator =
terminalAttributes.getShortcutIterator();
@ -379,11 +378,12 @@ inline void UnigramDictionary::onTerminal(const int freq,
// We need to either modulate the frequency of each shortcut according
// to its own shortcut frequency or to make the queue
// so that the insert order is protected inside the queue for words
// with the same score.
// with the same score. For the moment we use -1 to make sure the shortcut will
// never be in front of the word.
uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
}
}
}

View File

@ -49,10 +49,6 @@ class UnigramDictionary {
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Flag for shortcut-only words. Some words are shortcut-only, which means they match when
// the user types them but they don't pop in the suggestion strip, only the words they are
// shortcuts for do.
static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes