From 3edb62c69b76fb4619bd45cc4df49ee90664ce7b Mon Sep 17 00:00:00 2001 From: Yuichiro Hanada Date: Wed, 14 Aug 2013 19:03:09 +0900 Subject: [PATCH] Move some methods in BinaryDictIOUtils to DynamicBinaryDictIOUtils. Change-Id: I9ba55582c533fef0eb3e60c46bf23c8b16ee1ff4 --- .../latin/makedict/BinaryDictIOUtils.java | 463 +--------------- .../makedict/DynamicBinaryDictIOUtils.java | 493 ++++++++++++++++++ .../latin/makedict/BinaryDictIOTests.java | 4 +- .../makedict/BinaryDictIOUtilsTests.java | 6 +- 4 files changed, 504 insertions(+), 462 deletions(-) create mode 100644 java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 7c7064e4b..877fff267 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -32,14 +32,12 @@ import java.io.IOException; import java.io.OutputStream; import java.nio.channels.FileChannel; import java.util.ArrayList; -import java.util.Arrays; import java.util.Iterator; import java.util.Map; import java.util.Stack; public final class BinaryDictIOUtils { private static final boolean DBG = false; - private static final int MAX_JUMPS = 10000; private BinaryDictIOUtils() { // This utility class is not publicly instantiable. @@ -246,36 +244,10 @@ public final class BinaryDictIOUtils { return FormatSpec.NOT_VALID_WORD; } - private static int markAsDeleted(final int flags) { - return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; - } - - /** - * Delete the word from the binary file. - * - * @param buffer the buffer to write. - * @param word the word we delete - * @throws IOException - * @throws UnsupportedFormatException - */ - @UsedForTesting - public static void deleteWord(final FusionDictionaryBufferInterface buffer, - final String word) throws IOException, UnsupportedFormatException { - buffer.position(0); - final FileHeader header = BinaryDictInputOutput.readHeader(buffer); - final int wordPosition = getTerminalPosition(buffer, word); - if (wordPosition == FormatSpec.NOT_VALID_WORD) return; - - buffer.position(wordPosition); - final int flags = buffer.readUnsignedByte(); - buffer.position(wordPosition); - buffer.put((byte)markAsDeleted(flags)); - } - /** * @return the size written, in bytes. Always 3 bytes. */ - private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, + static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, final int value) { final int absValue = Math.abs(value); buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); @@ -287,7 +259,7 @@ public final class BinaryDictIOUtils { /** * @return the size written, in bytes. Always 3 bytes. */ - private static int writeSInt24ToStream(final OutputStream destination, final int value) + static int writeSInt24ToStream(final OutputStream destination, final int value) throws IOException { final int absValue = Math.abs(value); destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); @@ -318,39 +290,7 @@ public final class BinaryDictIOUtils { return BinaryDictInputOutput.getByteSize(value); } - /** - * Update a parent address in a CharGroup that is referred to by groupOriginAddress. - * - * @param buffer the buffer to write. - * @param groupOriginAddress the address of the group. - * @param newParentAddress the absolute address of the parent. - * @param formatOptions file format options. - */ - public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, - final int groupOriginAddress, final int newParentAddress, - final FormatOptions formatOptions) { - final int originalPosition = buffer.position(); - buffer.position(groupOriginAddress); - if (!formatOptions.mSupportsDynamicUpdate) { - throw new RuntimeException("this file format does not support parent addresses"); - } - final int flags = buffer.readUnsignedByte(); - if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { - // if the group is moved, the parent address is stored in the destination group. - // We are guaranteed to process the destination group later, so there is no need to - // update anything here. - buffer.position(originalPosition); - return; - } - if (DBG) { - MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); - } - final int parentOffset = newParentAddress - groupOriginAddress; - writeSInt24ToBuffer(buffer, parentOffset); - buffer.position(originalPosition); - } - - private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, + static void skipCharGroup(final FusionDictionaryBufferInterface buffer, final FormatOptions formatOptions) { final int flags = buffer.readUnsignedByte(); BinaryDictInputOutput.readParentAddress(buffer, formatOptions); @@ -385,33 +325,7 @@ public final class BinaryDictIOUtils { } } - /** - * Update parent addresses in a Node that is referred to by nodeOriginAddress. - * - * @param buffer the buffer to be modified. - * @param nodeOriginAddress the address of a modified Node. - * @param newParentAddress the address to be written. - * @param formatOptions file format options. - */ - public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, - final int nodeOriginAddress, final int newParentAddress, - final FormatOptions formatOptions) { - final int originalPosition = buffer.position(); - buffer.position(nodeOriginAddress); - do { - final int count = BinaryDictInputOutput.readCharGroupCount(buffer); - for (int i = 0; i < count; ++i) { - updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); - skipCharGroup(buffer, formatOptions); - } - final int forwardLinkAddress = buffer.readUnsignedInt24(); - buffer.position(forwardLinkAddress); - } while (formatOptions.mSupportsDynamicUpdate - && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); - buffer.position(originalPosition); - } - - private static void skipString(final FusionDictionaryBufferInterface buffer, + static void skipString(final FusionDictionaryBufferInterface buffer, final boolean hasMultipleChars) { if (hasMultipleChars) { int character = CharEncoding.readChar(buffer); @@ -452,29 +366,6 @@ public final class BinaryDictIOUtils { return size; } - /** - * Update a children address in a CharGroup that is addressed by groupOriginAddress. - * - * @param buffer the buffer to write. - * @param groupOriginAddress the address of the group. - * @param newChildrenAddress the absolute address of the child. - * @param formatOptions file format options. - */ - public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, - final int groupOriginAddress, final int newChildrenAddress, - final FormatOptions formatOptions) { - final int originalPosition = buffer.position(); - buffer.position(groupOriginAddress); - final int flags = buffer.readUnsignedByte(); - final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); - skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); - if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); - final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS - ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); - writeSInt24ToBuffer(buffer, childrenOffset); - buffer.position(originalPosition); - } - /** * Write a char group to an output stream. * A char group is an in-memory representation of a node in trie. @@ -567,52 +458,10 @@ public final class BinaryDictIOUtils { return size; } - @SuppressWarnings("unused") - private static void updateForwardLink(final FusionDictionaryBufferInterface buffer, - final int nodeOriginAddress, final int newNodeAddress, - final FormatOptions formatOptions) { - buffer.position(nodeOriginAddress); - int jumpCount = 0; - while (jumpCount++ < MAX_JUMPS) { - final int count = BinaryDictInputOutput.readCharGroupCount(buffer); - for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions); - final int forwardLinkAddress = buffer.readUnsignedInt24(); - if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { - buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); - writeSInt24ToBuffer(buffer, newNodeAddress); - return; - } - buffer.position(forwardLinkAddress); - } - if (DBG && jumpCount >= MAX_JUMPS) { - throw new RuntimeException("too many jumps, probably a bug."); - } - } - - /** - * Helper method to move a char group to the tail of the file. - */ - private static int moveCharGroup(final OutputStream destination, - final FusionDictionaryBufferInterface buffer, final CharGroupInfo info, - final int nodeOriginAddress, final int oldGroupAddress, - final FormatOptions formatOptions) throws IOException { - updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions); - buffer.position(oldGroupAddress); - final int currentFlags = buffer.readUnsignedByte(); - buffer.position(oldGroupAddress); - buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags - & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); - int size = FormatSpec.GROUP_FLAGS_SIZE; - updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions); - size += writeNode(destination, new CharGroupInfo[] { info }); - return size; - } - /** * Compute the size of the char group. */ - private static int computeGroupSize(final CharGroupInfo info, - final FormatOptions formatOptions) { + static int computeGroupSize(final CharGroupInfo info, final FormatOptions formatOptions) { int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters) + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions); @@ -639,7 +488,7 @@ public final class BinaryDictIOUtils { * @return the size written, in bytes. * @throws IOException */ - private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos) + static int writeNode(final OutputStream destination, final CharGroupInfo[] infos) throws IOException { int size = BinaryDictInputOutput.getGroupCountSize(infos.length); switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) { @@ -658,306 +507,6 @@ public final class BinaryDictIOUtils { return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } - /** - * Move a group that is referred to by oldGroupOrigin to the tail of the file. - * And set the children address to the byte after the group. - * - * @param nodeOrigin the address of the tail of the file. - * @param characters - * @param length - * @param flags - * @param frequency - * @param parentAddress - * @param shortcutTargets - * @param bigrams - * @param destination the stream representing the tail of the file. - * @param buffer the buffer representing the (constant-size) body of the file. - * @param oldNodeOrigin - * @param oldGroupOrigin - * @param formatOptions - * @return the size written, in bytes. - * @throws IOException - */ - private static int moveGroup(final int nodeOrigin, final int[] characters, final int length, - final int flags, final int frequency, final int parentAddress, - final ArrayList shortcutTargets, - final ArrayList bigrams, final OutputStream destination, - final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin, - final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException { - int size = 0; - final int newGroupOrigin = nodeOrigin + 1; - final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length); - final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */, - flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, - shortcutTargets, bigrams); - size = computeGroupSize(tmpInfo, formatOptions); - final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size, - flags, writtenCharacters, frequency, parentAddress, - nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, - bigrams); - moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions); - return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; - } - - /** - * Insert a word into a binary dictionary. - * - * @param buffer - * @param destination - * @param word - * @param frequency - * @param bigramStrings - * @param shortcuts - * @throws IOException - * @throws UnsupportedFormatException - */ - // TODO: Support batch insertion. - // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. - @UsedForTesting - public static void insertWord(final FusionDictionaryBufferInterface buffer, - final OutputStream destination, final String word, final int frequency, - final ArrayList bigramStrings, - final ArrayList shortcuts, final boolean isNotAWord, - final boolean isBlackListEntry) - throws IOException, UnsupportedFormatException { - final ArrayList bigrams = new ArrayList(); - if (bigramStrings != null) { - for (final WeightedString bigram : bigramStrings) { - int position = getTerminalPosition(buffer, bigram.mWord); - if (position == FormatSpec.NOT_VALID_WORD) { - // TODO: figure out what is the correct thing to do here. - } else { - bigrams.add(new PendingAttribute(bigram.mFrequency, position)); - } - } - } - - final boolean isTerminal = true; - final boolean hasBigrams = !bigrams.isEmpty(); - final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); - - // find the insert position of the word. - if (buffer.position() != 0) buffer.position(0); - final FileHeader header = BinaryDictInputOutput.readHeader(buffer); - - int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position(); - final int[] codePoints = FusionDictionary.getCodePoints(word); - final int wordLen = codePoints.length; - - for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { - if (wordPos >= wordLen) break; - nodeOriginAddress = buffer.position(); - int nodeParentAddress = -1; - final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); - boolean foundNextGroup = false; - - for (int i = 0; i < charGroupCount; ++i) { - address = buffer.position(); - final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, - buffer.position(), header.mFormatOptions); - final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, - header.mFormatOptions); - if (isMovedGroup) continue; - nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) - ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; - boolean matched = true; - for (int p = 0; p < currentInfo.mCharacters.length; ++p) { - if (wordPos + p >= wordLen) { - /* - * splitting - * before - * abcd - ef - * - * insert "abc" - * - * after - * abc - d - ef - */ - final int newNodeAddress = buffer.limit(); - final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, - isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, - false /* isBlackListEntry */, header.mFormatOptions); - int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags, - frequency, nodeParentAddress, shortcuts, bigrams, destination, - buffer, nodeOriginAddress, address, header.mFormatOptions); - - final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, - currentInfo.mCharacters.length); - if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentAddresses(buffer, currentInfo.mChildrenAddress, - newNodeAddress + written + 1, header.mFormatOptions); - } - final CharGroupInfo newInfo2 = new CharGroupInfo( - newNodeAddress + written + 1, -1 /* endAddress */, - currentInfo.mFlags, characters2, currentInfo.mFrequency, - newNodeAddress + 1, currentInfo.mChildrenAddress, - currentInfo.mShortcutTargets, currentInfo.mBigrams); - writeNode(destination, new CharGroupInfo[] { newInfo2 }); - return; - } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { - if (p > 0) { - /* - * splitting - * before - * ab - cd - * - * insert "ac" - * - * after - * a - b - cd - * | - * - c - */ - - final int newNodeAddress = buffer.limit(); - final int childrenAddress = currentInfo.mChildrenAddress; - - // move prefix - final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, - false /* isTerminal */, 0 /* childrenAddressSize*/, - false /* hasShortcut */, false /* hasBigrams */, - false /* isNotAWord */, false /* isBlackListEntry */, - header.mFormatOptions); - int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, - prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, - destination, buffer, nodeOriginAddress, address, - header.mFormatOptions); - - final int[] suffixCharacters = Arrays.copyOfRange( - currentInfo.mCharacters, p, currentInfo.mCharacters.length); - if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentAddresses(buffer, currentInfo.mChildrenAddress, - newNodeAddress + written + 1, header.mFormatOptions); - } - final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags( - suffixCharacters.length > 1, - (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, - 0 /* childrenAddressSize */, - (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) - != 0, - (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, - isNotAWord, isBlackListEntry, header.mFormatOptions); - final CharGroupInfo suffixInfo = new CharGroupInfo( - newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, - suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, - currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, - currentInfo.mBigrams); - written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1; - - final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, - codePoints.length); - final int flags = BinaryDictInputOutput.makeCharGroupFlags( - newCharacters.length > 1, isTerminal, - 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, header.mFormatOptions); - final CharGroupInfo newInfo = new CharGroupInfo( - newNodeAddress + written, -1 /* endAddress */, flags, - newCharacters, frequency, newNodeAddress + 1, - FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); - writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo }); - return; - } - matched = false; - break; - } - } - - if (matched) { - if (wordPos + currentInfo.mCharacters.length == wordLen) { - // the word exists in the dictionary. - // only update group. - final int newNodeAddress = buffer.limit(); - final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; - final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, - isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, header.mFormatOptions); - final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, - -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, - nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, - bigrams); - moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address, - header.mFormatOptions); - return; - } - wordPos += currentInfo.mCharacters.length; - if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { - /* - * found the prefix of the word. - * make new node and link to the node from this group. - * - * before - * ab - cd - * - * insert "abcde" - * - * after - * ab - cd - e - */ - final int newNodeAddress = buffer.limit(); - updateChildrenAddress(buffer, address, newNodeAddress, - header.mFormatOptions); - final int newGroupAddress = newNodeAddress + 1; - final boolean hasMultipleChars = (wordLen - wordPos) > 1; - final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, - isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, header.mFormatOptions); - final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); - final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags, - characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, - shortcuts, bigrams); - writeNode(destination, new CharGroupInfo[] { newInfo }); - return; - } - buffer.position(currentInfo.mChildrenAddress); - foundNextGroup = true; - break; - } - } - - if (foundNextGroup) continue; - - // reached the end of the array. - final int linkAddressPosition = buffer.position(); - int nextLink = buffer.readUnsignedInt24(); - if ((nextLink & FormatSpec.MSB24) != 0) { - nextLink = -(nextLink & FormatSpec.SINT24_MAX); - } - if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { - /* - * expand this node. - * - * before - * ab - cd - * - * insert "abef" - * - * after - * ab - cd - * | - * - ef - */ - - // change the forward link address. - final int newNodeAddress = buffer.limit(); - buffer.position(linkAddressPosition); - writeSInt24ToBuffer(buffer, newNodeAddress); - - final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); - final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1, - isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, header.mFormatOptions); - final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, - -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, - FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); - writeNode(destination, new CharGroupInfo[]{ newInfo }); - return; - } else { - depth--; - buffer.position(nextLink); - } - } - } - /** * Find a word from the buffer. * diff --git a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java new file mode 100644 index 000000000..fd728f1d7 --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java @@ -0,0 +1,493 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * The utility class to help dynamic updates on the binary dictionary. + * + * All the methods in this class are static. + */ +@UsedForTesting +public final class DynamicBinaryDictIOUtils { + private static final boolean DBG = false; + private static final int MAX_JUMPS = 10000; + + private DynamicBinaryDictIOUtils() { + // This utility class is not publicly instantiable. + } + + private static int markAsDeleted(final int flags) { + return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; + } + + /** + * Delete the word from the binary file. + * + * @param buffer the buffer to write. + * @param word the word we delete + * @throws IOException + * @throws UnsupportedFormatException + */ + @UsedForTesting + public static void deleteWord(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + final int wordPosition = BinaryDictIOUtils.getTerminalPosition(buffer, word); + if (wordPosition == FormatSpec.NOT_VALID_WORD) return; + + buffer.position(wordPosition); + final int flags = buffer.readUnsignedByte(); + buffer.position(wordPosition); + buffer.put((byte)markAsDeleted(flags)); + } + + /** + * Update a parent address in a CharGroup that is referred to by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newParentAddress the absolute address of the parent. + * @param formatOptions file format options. + */ + public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + if (!formatOptions.mSupportsDynamicUpdate) { + throw new RuntimeException("this file format does not support parent addresses"); + } + final int flags = buffer.readUnsignedByte(); + if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { + // if the group is moved, the parent address is stored in the destination group. + // We are guaranteed to process the destination group later, so there is no need to + // update anything here. + buffer.position(originalPosition); + return; + } + if (DBG) { + MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); + } + final int parentOffset = newParentAddress - groupOriginAddress; + BinaryDictIOUtils.writeSInt24ToBuffer(buffer, parentOffset); + buffer.position(originalPosition); + } + + /** + * Update parent addresses in a Node that is referred to by nodeOriginAddress. + * + * @param buffer the buffer to be modified. + * @param nodeOriginAddress the address of a modified Node. + * @param newParentAddress the address to be written. + * @param formatOptions file format options. + */ + public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(nodeOriginAddress); + do { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) { + updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); + BinaryDictIOUtils.skipCharGroup(buffer, formatOptions); + } + final int forwardLinkAddress = buffer.readUnsignedInt24(); + buffer.position(forwardLinkAddress); + } while (formatOptions.mSupportsDynamicUpdate + && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + buffer.position(originalPosition); + } + + /** + * Update a children address in a CharGroup that is addressed by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newChildrenAddress the absolute address of the child. + * @param formatOptions file format options. + */ + public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newChildrenAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + final int flags = buffer.readUnsignedByte(); + final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + BinaryDictIOUtils.skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS + ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); + BinaryDictIOUtils.writeSInt24ToBuffer(buffer, childrenOffset); + buffer.position(originalPosition); + } + + /** + * Helper method to move a char group to the tail of the file. + */ + private static int moveCharGroup(final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final CharGroupInfo info, + final int nodeOriginAddress, final int oldGroupAddress, + final FormatOptions formatOptions) throws IOException { + updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions); + buffer.position(oldGroupAddress); + final int currentFlags = buffer.readUnsignedByte(); + buffer.position(oldGroupAddress); + buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags + & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); + int size = FormatSpec.GROUP_FLAGS_SIZE; + updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions); + size += BinaryDictIOUtils.writeNode(destination, new CharGroupInfo[] { info }); + return size; + } + @SuppressWarnings("unused") + private static void updateForwardLink(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newNodeAddress, + final FormatOptions formatOptions) { + buffer.position(nodeOriginAddress); + int jumpCount = 0; + while (jumpCount++ < MAX_JUMPS) { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) BinaryDictIOUtils.skipCharGroup(buffer, formatOptions); + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + BinaryDictIOUtils.writeSInt24ToBuffer(buffer, newNodeAddress); + return; + } + buffer.position(forwardLinkAddress); + } + if (DBG && jumpCount >= MAX_JUMPS) { + throw new RuntimeException("too many jumps, probably a bug."); + } + } + + /** + * Move a group that is referred to by oldGroupOrigin to the tail of the file. + * And set the children address to the byte after the group. + * + * @param nodeOrigin the address of the tail of the file. + * @param characters + * @param length + * @param flags + * @param frequency + * @param parentAddress + * @param shortcutTargets + * @param bigrams + * @param destination the stream representing the tail of the file. + * @param buffer the buffer representing the (constant-size) body of the file. + * @param oldNodeOrigin + * @param oldGroupOrigin + * @param formatOptions + * @return the size written, in bytes. + * @throws IOException + */ + private static int moveGroup(final int nodeOrigin, final int[] characters, final int length, + final int flags, final int frequency, final int parentAddress, + final ArrayList shortcutTargets, + final ArrayList bigrams, final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin, + final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException { + int size = 0; + final int newGroupOrigin = nodeOrigin + 1; + final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length); + final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */, + flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, + shortcutTargets, bigrams); + size = BinaryDictIOUtils.computeGroupSize(tmpInfo, formatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size, + flags, writtenCharacters, frequency, parentAddress, + nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, + bigrams); + moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions); + return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; + } + + /** + * Insert a word into a binary dictionary. + * + * @param buffer + * @param destination + * @param word + * @param frequency + * @param bigramStrings + * @param shortcuts + * @throws IOException + * @throws UnsupportedFormatException + */ + // TODO: Support batch insertion. + // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. + @UsedForTesting + public static void insertWord(final FusionDictionaryBufferInterface buffer, + final OutputStream destination, final String word, final int frequency, + final ArrayList bigramStrings, + final ArrayList shortcuts, final boolean isNotAWord, + final boolean isBlackListEntry) + throws IOException, UnsupportedFormatException { + final ArrayList bigrams = new ArrayList(); + if (bigramStrings != null) { + for (final WeightedString bigram : bigramStrings) { + int position = BinaryDictIOUtils.getTerminalPosition(buffer, bigram.mWord); + if (position == FormatSpec.NOT_VALID_WORD) { + // TODO: figure out what is the correct thing to do here. + } else { + bigrams.add(new PendingAttribute(bigram.mFrequency, position)); + } + } + } + + final boolean isTerminal = true; + final boolean hasBigrams = !bigrams.isEmpty(); + final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); + + // find the insert position of the word. + if (buffer.position() != 0) buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + + int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position(); + final int[] codePoints = FusionDictionary.getCodePoints(word); + final int wordLen = codePoints.length; + + for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) break; + nodeOriginAddress = buffer.position(); + int nodeParentAddress = -1; + final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); + boolean foundNextGroup = false; + + for (int i = 0; i < charGroupCount; ++i) { + address = buffer.position(); + final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, + buffer.position(), header.mFormatOptions); + final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; + nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) + ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; + boolean matched = true; + for (int p = 0; p < currentInfo.mCharacters.length; ++p) { + if (wordPos + p >= wordLen) { + /* + * splitting + * before + * abcd - ef + * + * insert "abc" + * + * after + * abc - d - ef + */ + final int newNodeAddress = buffer.limit(); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, + false /* isBlackListEntry */, header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags, + frequency, nodeParentAddress, shortcuts, bigrams, destination, + buffer, nodeOriginAddress, address, header.mFormatOptions); + + final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, + currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final CharGroupInfo newInfo2 = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, + currentInfo.mFlags, characters2, currentInfo.mFrequency, + newNodeAddress + 1, currentInfo.mChildrenAddress, + currentInfo.mShortcutTargets, currentInfo.mBigrams); + BinaryDictIOUtils.writeNode(destination, new CharGroupInfo[] { newInfo2 }); + return; + } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { + if (p > 0) { + /* + * splitting + * before + * ab - cd + * + * insert "ac" + * + * after + * a - b - cd + * | + * - c + */ + + final int newNodeAddress = buffer.limit(); + final int childrenAddress = currentInfo.mChildrenAddress; + + // move prefix + final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + false /* isTerminal */, 0 /* childrenAddressSize*/, + false /* hasShortcut */, false /* hasBigrams */, + false /* isNotAWord */, false /* isBlackListEntry */, + header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, + prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, + destination, buffer, nodeOriginAddress, address, + header.mFormatOptions); + + final int[] suffixCharacters = Arrays.copyOfRange( + currentInfo.mCharacters, p, currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags( + suffixCharacters.length > 1, + (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, + 0 /* childrenAddressSize */, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) + != 0, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo suffixInfo = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, + suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, + currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, + currentInfo.mBigrams); + written += BinaryDictIOUtils.computeGroupSize(suffixInfo, + header.mFormatOptions) + 1; + + final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, + codePoints.length); + final int flags = BinaryDictInputOutput.makeCharGroupFlags( + newCharacters.length > 1, isTerminal, + 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo( + newNodeAddress + written, -1 /* endAddress */, flags, + newCharacters, frequency, newNodeAddress + 1, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + BinaryDictIOUtils.writeNode(destination, + new CharGroupInfo[] { suffixInfo, newInfo }); + return; + } + matched = false; + break; + } + } + + if (matched) { + if (wordPos + currentInfo.mCharacters.length == wordLen) { + // the word exists in the dictionary. + // only update group. + final int newNodeAddress = buffer.limit(); + final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, + nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, + bigrams); + moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address, + header.mFormatOptions); + return; + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + /* + * found the prefix of the word. + * make new node and link to the node from this group. + * + * before + * ab - cd + * + * insert "abcde" + * + * after + * ab - cd - e + */ + final int newNodeAddress = buffer.limit(); + updateChildrenAddress(buffer, address, newNodeAddress, + header.mFormatOptions); + final int newGroupAddress = newNodeAddress + 1; + final boolean hasMultipleChars = (wordLen - wordPos) > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags, + characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, + shortcuts, bigrams); + BinaryDictIOUtils.writeNode(destination, new CharGroupInfo[] { newInfo }); + return; + } + buffer.position(currentInfo.mChildrenAddress); + foundNextGroup = true; + break; + } + } + + if (foundNextGroup) continue; + + // reached the end of the array. + final int linkAddressPosition = buffer.position(); + int nextLink = buffer.readUnsignedInt24(); + if ((nextLink & FormatSpec.MSB24) != 0) { + nextLink = -(nextLink & FormatSpec.SINT24_MAX); + } + if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + /* + * expand this node. + * + * before + * ab - cd + * + * insert "abef" + * + * after + * ab - cd + * | + * - ef + */ + + // change the forward link address. + final int newNodeAddress = buffer.limit(); + buffer.position(linkAddressPosition); + BinaryDictIOUtils.writeSInt24ToBuffer(buffer, newNodeAddress); + + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + BinaryDictIOUtils.writeNode(destination, new CharGroupInfo[]{ newInfo }); + return; + } else { + depth--; + buffer.position(nextLink); + } + } + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java index e4b5ad279..7bfd6032a 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java @@ -619,13 +619,13 @@ public class BinaryDictIOTests extends AndroidTestCase { try { MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, BinaryDictIOUtils.getTerminalPosition(buffer, sWords.get(0))); - BinaryDictIOUtils.deleteWord(buffer, sWords.get(0)); + DynamicBinaryDictIOUtils.deleteWord(buffer, sWords.get(0)); assertEquals(FormatSpec.NOT_VALID_WORD, BinaryDictIOUtils.getTerminalPosition(buffer, sWords.get(0))); MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, BinaryDictIOUtils.getTerminalPosition(buffer, sWords.get(5))); - BinaryDictIOUtils.deleteWord(buffer, sWords.get(5)); + DynamicBinaryDictIOUtils.deleteWord(buffer, sWords.get(5)); assertEquals(FormatSpec.NOT_VALID_WORD, BinaryDictIOUtils.getTerminalPosition(buffer, sWords.get(5))); } catch (IOException e) { diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtilsTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtilsTests.java index 9331da44b..e75950703 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtilsTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtilsTests.java @@ -197,8 +197,8 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase { assertEquals(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word)); } final long now = System.nanoTime(); - BinaryDictIOUtils.insertWord(buffer, outStream, word, frequency, bigrams, shortcuts, - false, false); + DynamicBinaryDictIOUtils.insertWord(buffer, outStream, word, frequency, bigrams, + shortcuts, false, false); amountOfTime = System.nanoTime() - now; outStream.flush(); MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word)); @@ -232,7 +232,7 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase { raFile = new RandomAccessFile(file, "rw"); buffer = new ByteBufferWrapper(raFile.getChannel().map( FileChannel.MapMode.READ_WRITE, 0, file.length())); - BinaryDictIOUtils.deleteWord(buffer, word); + DynamicBinaryDictIOUtils.deleteWord(buffer, word); } catch (IOException e) { } catch (UnsupportedFormatException e) { } finally {