Merge "Extend unigram probability field to support historical info."

This commit is contained in:
Keisuke Kuroyanagi 2013-11-29 07:29:53 +00:00 committed by Android (Google) Code Review
commit 5c48f1970a
8 changed files with 208 additions and 48 deletions

View File

@ -127,7 +127,8 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNo
ptNodeParams->getHeadPos(), writingPos));
mValidPtNodeCount++;
// Writes current PtNode.
return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos);
return mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(ptNodeParams,
0 /* timestamp */, &writingPos);
}
bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields

View File

@ -16,55 +16,72 @@
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
int ProbabilityDictContent::getProbability(const int terminalId) const {
void ProbabilityDictContent::getProbabilityEntry(const int terminalId,
ProbabilityEntry *const outProbabilityEntry) const {
if (terminalId < 0 || terminalId >= mSize) {
return NOT_A_PROBABILITY;
outProbabilityEntry->setProbability(0 /* flags */, NOT_A_PROBABILITY);
AKLOGE("Terminal id (%d) is not in the probability dict content. mSize: %d", terminalId,
mSize);
return;
}
const BufferWithExtendableBuffer *const buffer = getBuffer();
int entryPos = getEntryPos(terminalId);
const int flags = buffer->readUintAndAdvancePosition(
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos);
const int probability = buffer->readUintAndAdvancePosition(
Ver4DictConstants::PROBABILITY_SIZE, &entryPos);
if (mHasHistoricalInfo) {
const int timestamp = buffer->readUintAndAdvancePosition(
Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos);
const int level = buffer->readUintAndAdvancePosition(
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos);
const int count = buffer->readUintAndAdvancePosition(
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos);
outProbabilityEntry->setProbabilityWithHistricalInfo(flags, probability, timestamp, level,
count);
} else {
outProbabilityEntry->setProbability(flags, probability);
}
const int probabilityFieldPos =
getEntryPos(terminalId) + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
return getBuffer()->readUint(Ver4DictConstants::PROBABILITY_SIZE, probabilityFieldPos);
}
bool ProbabilityDictContent::setProbability(const int terminalId, const int probability) {
bool ProbabilityDictContent::setProbabilityEntry(const int terminalId,
const ProbabilityEntry *const probabilityEntry) {
if (terminalId < 0) {
return false;
}
const int entryPos = getEntryPos(terminalId);
if (terminalId >= mSize) {
ProbabilityEntry dummyEntry;
// Write new entry.
int writingPos = getBuffer()->getTailPosition();
while (writingPos <= getEntryPos(terminalId)) {
const int dummyFlags = 0;
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
return false;
}
const int dummyProbability = 0;
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyProbability,
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
while (writingPos <= entryPos) {
// Fill the gap with dummy entries up to and including entryPos.
if (!writeEntry(&dummyEntry, writingPos)) {
AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize);
return false;
}
writingPos += getEntrySize();
mSize++;
}
}
const int probabilityWritingPos = getEntryPos(terminalId)
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
return getWritableBuffer()->writeUint(probability,
Ver4DictConstants::PROBABILITY_SIZE, probabilityWritingPos);
return writeEntry(probabilityEntry, entryPos);
}
bool ProbabilityDictContent::flushToFile(const char *const dictDirPath) const {
if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
ProbabilityDictContent probabilityDictContentToWrite;
ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo);
ProbabilityEntry probabilityEntry;
for (int i = 0; i < mSize; ++i) {
if (!probabilityDictContentToWrite.setProbability(i, getProbability(i))) {
getProbabilityEntry(i, &probabilityEntry);
if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) {
AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i);
return false;
}
}
@ -79,10 +96,12 @@ bool ProbabilityDictContent::runGC(
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const ProbabilityDictContent *const originalProbabilityDictContent) {
mSize = 0;
ProbabilityEntry probabilityEntry;
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
it != terminalIdMap->end(); ++it) {
if (!setProbability(it->second,
originalProbabilityDictContent->getProbability(it->first))) {
originalProbabilityDictContent->getProbabilityEntry(it->first, &probabilityEntry);
if (!setProbabilityEntry(it->second, &probabilityEntry)) {
AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second);
return false;
}
mSize++;
@ -90,9 +109,55 @@ bool ProbabilityDictContent::runGC(
return true;
}
// Returns the size of a single entry in the probability dict content. Every entry holds the
// flags and probability fields; when mHasHistoricalInfo is set, the timestamp, level and count
// fields are stored as well.
int ProbabilityDictContent::getEntrySize() const {
    const int basicFieldsSize = Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
            + Ver4DictConstants::PROBABILITY_SIZE;
    if (!mHasHistoricalInfo) {
        return basicFieldsSize;
    }
    return basicFieldsSize
            + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
            + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
            + Ver4DictConstants::WORD_COUNT_FIELD_SIZE;
}
int ProbabilityDictContent::getEntryPos(const int terminalId) const {
return terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
+ Ver4DictConstants::PROBABILITY_SIZE);
return terminalId * getEntrySize();
}
bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry,
const int entryPos) {
BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer();
int writingPos = entryPos;
if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(),
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos);
return false;
}
if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(),
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos);
return false;
}
if (mHasHistoricalInfo) {
if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getTimeStamp(),
Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) {
AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos);
return false;
}
if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getLevel(),
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) {
AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos);
return false;
}
if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getCount(),
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) {
AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos);
return false;
}
}
return true;
}
} // namespace latinime

View File

@ -25,18 +25,23 @@
namespace latinime {
class ProbabilityEntry;
class ProbabilityDictContent : public SingleDictContent {
public:
ProbabilityDictContent(const char *const dictDirPath, const bool isUpdatable)
ProbabilityDictContent(const char *const dictDirPath, const bool hasHistoricalInfo,
const bool isUpdatable)
: SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable),
mSize(getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE)) {}
mHasHistoricalInfo(hasHistoricalInfo),
mSize(getBuffer()->getTailPosition() / getEntrySize()) {}
ProbabilityDictContent() : mSize(0) {}
ProbabilityDictContent(const bool hasHistoricalInfo)
: mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {}
int getProbability(const int terminalId) const;
void getProbabilityEntry(const int terminalId,
ProbabilityEntry *const outProbabilityEntry) const;
bool setProbability(const int terminalId, const int probability);
bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry);
bool flushToFile(const char *const dictDirPath) const;
@ -46,8 +51,13 @@ class ProbabilityDictContent : public SingleDictContent {
private:
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
int getEntrySize() const;
int getEntryPos(const int terminalId) const;
bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos);
bool mHasHistoricalInfo;
int mSize;
};
} // namespace latinime

View File

@ -0,0 +1,78 @@
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_PROBABILITY_ENTRY_H
#define LATINIME_PROBABILITY_ENTRY_H
#include "defines.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
namespace latinime {
/**
 * Holds the fields of a single probability entry: flags and probability, plus the timestamp,
 * level and count that make up the historical info.
 */
class ProbabilityEntry {
 public:
    // Creates an entry with no flags, NOT_A_PROBABILITY as the probability, NOT_A_TIME_STAMP as
    // the timestamp and zero level and count.
    ProbabilityEntry()
            : mFlags(0), mProbability(NOT_A_PROBABILITY),
              mTimestamp(Ver4DictConstants::NOT_A_TIME_STAMP), mLevel(0), mCount(0) {}

    // Sets flags and probability. The historical fields are reset: the timestamp becomes
    // NOT_A_TIME_STAMP, and level and count become 0.
    void setProbability(const int flags, const int probability) {
        setProbabilityWithHistricalInfo(flags, probability,
                Ver4DictConstants::NOT_A_TIME_STAMP, 0 /* level */, 0 /* count */);
    }

    // Sets every field of the entry, including the historical info.
    void setProbabilityWithHistricalInfo(const int flags, const int probability,
            const int timestamp, const int level, const int count) {
        mFlags = flags;
        mProbability = probability;
        mTimestamp = timestamp;
        mLevel = level;
        mCount = count;
    }

    int getFlags() const { return mFlags; }

    int getProbability() const { return mProbability; }

    int getTimeStamp() const { return mTimestamp; }

    int getLevel() const { return mLevel; }

    int getCount() const { return mCount; }

 private:
    DISALLOW_COPY_AND_ASSIGN(ProbabilityEntry);

    int mFlags;
    int mProbability;
    // Historical info.
    int mTimestamp;
    int mLevel;
    int mCount;
};
} // namespace latinime
#endif /* LATINIME_PROBABILITY_ENTRY_H */

View File

@ -126,7 +126,7 @@ class Ver4DictBuffers {
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
// TODO: Quit using header size.
mTerminalPositionLookupTable(dictDirPath, isUpdatable, mHeaderSize),
mProbabilityDictContent(dictDirPath, isUpdatable),
mProbabilityDictContent(dictDirPath, false /* hasHistoricalInfo */, isUpdatable),
mBigramDictContent(dictDirPath, isUpdatable),
mShortcutDictContent(dictDirPath, isUpdatable),
mIsUpdatable(isUpdatable) {}
@ -135,7 +135,8 @@ class Ver4DictBuffers {
: mDictBuffer(0), mHeaderSize(0),
mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
mTerminalPositionLookupTable(), mProbabilityDictContent(),
mTerminalPositionLookupTable(),
mProbabilityDictContent(false /* hasHistoricalInfo */),
mBigramDictContent(), mShortcutDictContent(), mIsUpdatable(true) {}
const MmappedBuffer::MmappedBufferPtr mDictBuffer;

View File

@ -45,6 +45,8 @@ const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4;

View File

@ -19,6 +19,7 @@
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
@ -59,7 +60,9 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
terminalIdFieldPos += mBuffer->getOriginalBufferSize();
}
terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
probability = mProbabilityDictContent->getProbability(terminalId);
ProbabilityEntry probabilityEntry;
mProbabilityDictContent->getProbabilityEntry(terminalId, &probabilityEntry);
probability = probabilityEntry.getProbability();
}
int childrenPosFieldPos = pos;
if (usesAdditionalBuffer) {

View File

@ -19,6 +19,7 @@
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h"
@ -119,8 +120,12 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability(
}
const int probabilityToWrite = getUpdatedProbability(toBeUpdatedPtNodeParams->getProbability(),
newProbability);
return mBuffers->getUpdatableProbabilityDictContent()->setProbability(
toBeUpdatedPtNodeParams->getTerminalId(), probabilityToWrite);
ProbabilityEntry probabilityEntry;
mBuffers->getProbabilityDictContent()->getProbabilityEntry(
toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
probabilityEntry.setProbability(probabilityEntry.getFlags(), probabilityToWrite);
return mBuffers->getUpdatableProbabilityDictContent()->setProbabilityEntry(
toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
}
bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
@ -153,8 +158,10 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
// Write probability.
const int probabilityToWrite = getUpdatedProbability(NOT_A_PROBABILITY,
ptNodeParams->getProbability());
return mBuffers->getUpdatableProbabilityDictContent()->setProbability(terminalId,
probabilityToWrite);
ProbabilityEntry probabilityEntry;
probabilityEntry.setProbability(0 /* flags */, probabilityToWrite);
return mBuffers->getUpdatableProbabilityDictContent()->setProbabilityEntry(terminalId,
&probabilityEntry);
}
bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry(
@ -258,13 +265,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
return false;
}
// Write probability.
if (ptNodeParams->getProbability() != NOT_A_PROBABILITY) {
if (!mBuffers->getUpdatableProbabilityDictContent()->setProbability(
terminalId, ptNodeParams->getProbability())) {
return false;
}
}
if (outTerminalId) {
*outTerminalId = terminalId;
}