From f6b0e32df38da4e2130bdbfc8875ea2d19054caf Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 21 Oct 2014 17:31:00 +0900 Subject: [PATCH] Add a *FAST* dictionary header reader. It's still unused as of this change but the next change will use it As a reference point, generating the metadata for Bayo takes 3'02" on my machine with the info command; it's down to 16" if made to use this instead. The gains increases with the number of dictionaries obviously. Change-Id: I0eeea2d8f81bb74b0d1570af658e91b56f7c2b79 --- .../makedict/BinaryDictDecoderUtils.java | 42 +++++++++++++++ .../dicttool/BinaryDictOffdeviceUtils.java | 53 +++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java index 120b96bc6..be75565bb 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -17,11 +17,16 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.util.HashMap; +import java.util.LinkedList; + +import javax.annotation.Nonnull; /** * Decodes binary files for a FusionDictionary. @@ -360,6 +365,43 @@ public final class BinaryDictDecoderUtils { return result; } + /** + * Helper method that brutally decodes a header from a byte array. + * + * @param headerBuffer a buffer containing the bytes of the header. + * @return a hashmap of the attributes stored in the header + */ + @Nonnull + public static HashMap decodeHeaderAttributes(@Nonnull final byte[] headerBuffer) + throws UnsupportedFormatException { + final StringBuilder sb = new StringBuilder(); + final LinkedList keyValues = new LinkedList<>(); + int index = 0; + while (index < headerBuffer.length) { + if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) { + keyValues.add(sb.toString()); + sb.setLength(0); + } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF, + null /* codePointTable */)) { + sb.appendCodePoint(headerBuffer[index] & 0xFF); + } else { + sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16) + + ((headerBuffer[index + 1] & 0xFF) << 8) + + (headerBuffer[index + 2] & 0xFF)); + index += 2; + } + index += 1; + } + if ((keyValues.size() & 1) != 0) { + throw new UnsupportedFormatException("Odd number of attributes"); + } + final HashMap attributes = new HashMap<>(); + for (int i = 0; i < keyValues.size(); i += 2) { + attributes.put(keyValues.get(i), keyValues.get(i + 1)); + } + return attributes; + } + /** * Helper method to pass a file name instead of a File object to isBinaryDictionary. */ diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java index 49a6e8e14..7894e17c4 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -19,6 +19,10 @@ package com.android.inputmethod.latin.dicttool; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils; import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; import com.android.inputmethod.latin.makedict.DictDecoder; +import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; @@ -34,6 +38,8 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -142,6 +148,53 @@ public final class BinaryDictOffdeviceUtils { } } + public static class HeaderReaderProcessor implements InputProcessor { + // Arbitrarily limit the header length to 32k. Sounds like it would never be larger + // than this. Revisit this if needed later. + private final int MAX_HEADER_LENGTH = 32 * 1024; + @Override @Nonnull + public DictionaryHeader process(final InputStream input) throws IOException, + UnsupportedFormatException { + // Do everything as curtly and ad-hoc as possible for performance. + final byte[] tmpBuffer = new byte[12]; + if (tmpBuffer.length != input.read(tmpBuffer)) { + throw new UnsupportedFormatException("File too short, not a dictionary"); + } + // Ad-hoc check for the magic number. See FormatSpec.java as well as + // byte_array_utils.h and BinaryDictEncoderUtils#writeDictionaryHeader(). + final int MAGIC_NUMBER_START_OFFSET = 0; + final int VERSION_START_OFFSET = 4; + final int HEADER_SIZE_OFFSET = 8; + final int magicNumber = ((tmpBuffer[MAGIC_NUMBER_START_OFFSET] & 0xFF) << 24) + + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 1] & 0xFF) << 16) + + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 2] & 0xFF) << 8) + + (tmpBuffer[MAGIC_NUMBER_START_OFFSET + 3] & 0xFF); + if (magicNumber != FormatSpec.MAGIC_NUMBER) { + throw new UnsupportedFormatException("Wrong magic number"); + } + final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8) + + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF); + if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201) { + throw new UnsupportedFormatException("Only versions 2 and 201 are supported"); + } + final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) >> 24) + + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) >> 16) + + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) >> 8) + + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF); + if (totalHeaderSize > MAX_HEADER_LENGTH) { + throw new UnsupportedFormatException("Header too large"); + } + final byte[] headerBuffer = new byte[totalHeaderSize - tmpBuffer.length]; + if (headerBuffer.length != input.read(headerBuffer)) { + throw new UnsupportedFormatException("File shorter than specified in the header"); + } + final HashMap attributes = + BinaryDictDecoderUtils.decodeHeaderAttributes(headerBuffer); + return new DictionaryHeader(totalHeaderSize, new DictionaryOptions(attributes), + new FormatOptions(version, false /* hasTimestamp */)); + } + } + public static void copy(final InputStream input, final OutputStream output) throws IOException { final byte[] buffer = new byte[COPY_BUFFER_SIZE]; for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) {