Add a header command to dicttool.

This makes it possible to greatly improve the performance of the
metadata-generating files, as they won't have to wait for
the info command to read the entire dictionary when the
header is all they need.

Also add tests, and while we're at it, use the seed as
intended to enable reproducible tests.

Change-Id: I0ba79ef62f0292b23e63aed57ff565bb102281a2
This commit is contained in:
Jean Chalard 2014-10-20 21:53:31 +09:00
parent f6b0e32df3
commit 5b33d197ba
4 changed files with 162 additions and 17 deletions

View File

@ -24,8 +24,13 @@ import android.util.SparseArray;
import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.BinaryDictionary;
import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.common.CodePointUtils;
import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils;
import com.android.inputmethod.latin.dicttool.Compress;
import com.android.inputmethod.latin.dicttool.Crypt;
import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
@ -67,6 +72,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
private static final SparseArray<List<Integer>> sChainBigrams = new SparseArray<>(); private static final SparseArray<List<Integer>> sChainBigrams = new SparseArray<>();
private static final HashMap<String, List<String>> sShortcuts = new HashMap<>(); private static final HashMap<String, List<String>> sShortcuts = new HashMap<>();
final Random mRandom;
public BinaryDictDecoderEncoderTests() { public BinaryDictDecoderEncoderTests() {
this(System.currentTimeMillis(), DEFAULT_MAX_UNIGRAMS); this(System.currentTimeMillis(), DEFAULT_MAX_UNIGRAMS);
} }
@ -75,10 +82,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
super(); super();
BinaryDictionaryUtils.setCurrentTimeForTest(0); BinaryDictionaryUtils.setCurrentTimeForTest(0);
Log.e(TAG, "Testing dictionary: seed is " + seed); Log.e(TAG, "Testing dictionary: seed is " + seed);
final Random random = new Random(seed); mRandom = new Random(seed);
sWords.clear(); sWords.clear();
sWordsWithVariousCodePoints.clear(); sWordsWithVariousCodePoints.clear();
generateWords(maxUnigrams, random); generateWords(maxUnigrams, mRandom);
for (int i = 0; i < sWords.size(); ++i) { for (int i = 0; i < sWords.size(); ++i) {
sChainBigrams.put(i, new ArrayList<Integer>()); sChainBigrams.put(i, new ArrayList<Integer>());
@ -96,10 +103,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
sShortcuts.clear(); sShortcuts.clear();
for (int i = 0; i < NUM_OF_NODES_HAVING_SHORTCUTS; ++i) { for (int i = 0; i < NUM_OF_NODES_HAVING_SHORTCUTS; ++i) {
final int from = Math.abs(random.nextInt()) % sWords.size(); final int from = Math.abs(mRandom.nextInt()) % sWords.size();
sShortcuts.put(sWords.get(from), new ArrayList<String>()); sShortcuts.put(sWords.get(from), new ArrayList<String>());
for (int j = 0; j < NUM_OF_SHORTCUTS; ++j) { for (int j = 0; j < NUM_OF_SHORTCUTS; ++j) {
final int to = Math.abs(random.nextInt()) % sWords.size(); final int to = Math.abs(mRandom.nextInt()) % sWords.size();
sShortcuts.get(sWords.get(from)).add(sWords.get(to)); sShortcuts.get(sWords.get(from)).add(sWords.get(to));
} }
} }
@ -604,11 +611,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
+ " : " + outputOptions(bufferType, formatOptions)); + " : " + outputOptions(bufferType, formatOptions));
// Test a word that isn't contained within the dictionary. // Test a word that isn't contained within the dictionary.
final Random random = new Random((int)System.currentTimeMillis());
final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
random); mRandom);
for (int i = 0; i < 1000; ++i) { for (int i = 0; i < 1000; ++i) {
final String word = CodePointUtils.generateWord(random, codePointSet); final String word = CodePointUtils.generateWord(mRandom, codePointSet);
if (sWords.indexOf(word) != -1) continue; if (sWords.indexOf(word) != -1) continue;
checkGetTerminalPosition(dictDecoder, word, false); checkGetTerminalPosition(dictDecoder, word, false);
} }
@ -731,4 +737,61 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
assertTrue(wordSet.isEmpty()); assertTrue(wordSet.isEmpty());
assertTrue(bigramSet.isEmpty()); assertTrue(bigramSet.isEmpty());
} }
/**
 * Writes a dictionary with known header attributes, optionally packages it with the
 * dicttool compress and encrypt commands, then reads back only the header through
 * {@link BinaryDictOffdeviceUtils.HeaderReaderProcessor} and checks both the detected
 * decoder chain and the decoded attributes.
 *
 * @param compress whether to run the written file through {@code Compress.Compressor}
 * @param crypt whether to run the (possibly compressed) file through {@code Crypt.Encrypter}
 * @throws IOException if the dictionary file cannot be obtained or written
 */
public void runTestHeaderReaderProcessorWithOneSpec(final boolean compress, final boolean crypt)
        throws IOException {
    final String dictName = "testHeaderReaderProcessor";
    final String dictVersion = Long.toString(System.currentTimeMillis());
    final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS;
    final int MAX_NUMBER_OF_OPTIONS_TO_ADD = 5;

    final HashMap<String, String> options = new HashMap<>();
    // Required attributes
    options.put("dictionary", "main:en_US");
    options.put("locale", "en_US");
    options.put("version", Integer.toString(mRandom.nextInt()));
    // Add some random options for test. nextInt(bound) is used rather than
    // nextInt() % bound because the latter is negative about half of the time, which
    // would silently skip adding any option.
    final int numberOfOptionsToAdd = Math.min(
            mRandom.nextInt(MAX_NUMBER_OF_OPTIONS_TO_ADD + 1),
            // Each option consumes two words (key and value); never read past the list.
            sWordsWithVariousCodePoints.size() / 2);
    for (int i = 0; i < numberOfOptionsToAdd; ++i) {
        // Words at even indices are keys, the following odd index is the matching value.
        options.put(sWordsWithVariousCodePoints.get(2 * i),
                sWordsWithVariousCodePoints.get(2 * i + 1));
    }

    final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
            new DictionaryOptions(options));
    addUnigrams(sWords.size(), dict, sWords, null);
    File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions,
            getContext().getCacheDir());
    timeWritingDictToFile(file, dict, formatOptions);

    // Packaging order matters: compression is applied first and encryption second, so
    // the decoder is expected to undo encryption first and compression second.
    if (compress) {
        final File rawFile = file;
        file = BinaryDictUtils.getDictFile(dictName + "compress", dictVersion, formatOptions,
                getContext().getCacheDir());
        final Compress.Compressor compressCommand = new Compress.Compressor();
        compressCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() });
        compressCommand.run();
    }
    if (crypt) {
        final File rawFile = file;
        file = BinaryDictUtils.getDictFile(dictName + "crypt", dictVersion, formatOptions,
                getContext().getCacheDir());
        final Crypt.Encrypter cryptCommand = new Crypt.Encrypter();
        cryptCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() });
        cryptCommand.run();
    }

    final DecoderChainSpec<DictionaryHeader> spec =
            BinaryDictOffdeviceUtils.decodeDictionaryForProcess(file,
                    new BinaryDictOffdeviceUtils.HeaderReaderProcessor());
    assertNotNull("Can't decode a dictionary we just wrote : " + file, spec);
    final DictionaryHeader header = spec.mResult;
    assertEquals("raw" + (crypt ? " > encryption" : "") + (compress ? " > compression" : ""),
            spec.describeChain());
    // Expected value first, so that a failure message reports the two maps the right way.
    assertEquals(options, header.mDictionaryOptions.mAttributes);
}
/**
 * Runs the header-reading round trip for each supported packaging of the dictionary
 * file: raw, compressed only, and compressed then encrypted.
 *
 * @throws IOException if writing one of the test dictionaries fails
 */
public void testHeaderReaderProcessor() throws IOException {
    // Each row is { compress, crypt }, exercised in this order.
    final boolean[][] packagingSpecs = {
        { false, false },
        { true, false },
        { true, true },
    };
    for (final boolean[] packaging : packagingSpecs) {
        runTestHeaderReaderProcessorWithOneSpec(packaging[0], packaging[1]);
    }
}
} }

View File

@ -65,7 +65,7 @@ public final class BinaryDictOffdeviceUtils {
}; };
private final int mDecoderSpecIndex; private final int mDecoderSpecIndex;
T mResult; public T mResult;
public DecoderChainSpec() { public DecoderChainSpec() {
mDecoderSpecIndex = 0; mDecoderSpecIndex = 0;
@ -174,12 +174,13 @@ public final class BinaryDictOffdeviceUtils {
} }
final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8) final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8)
+ (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF); + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF);
if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201) { if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201
throw new UnsupportedFormatException("Only versions 2 and 201 are supported"); && version != FormatSpec.VERSION202) {
throw new UnsupportedFormatException("Only versions 2, 201, 202 are supported");
} }
final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) >> 24) final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) << 24)
+ ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) >> 16) + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) << 16)
+ ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) >> 8) + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) << 8)
+ (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF); + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF);
if (totalHeaderSize > MAX_HEADER_LENGTH) { if (totalHeaderSize > MAX_HEADER_LENGTH) {
throw new UnsupportedFormatException("Header too large"); throw new UnsupportedFormatException("Header too large");
@ -215,11 +216,22 @@ public final class BinaryDictOffdeviceUtils {
@Nonnull DecoderChainSpec spec = new DecoderChainSpec(); @Nonnull DecoderChainSpec spec = new DecoderChainSpec();
while (null != spec) { while (null != spec) {
try { try {
try (final InputStream input = spec.getStream(src)) { final InputStream input = spec.getStream(src);
spec.mResult = processor.process(input); spec.mResult = processor.process(input);
return spec; try {
input.close();
} catch (IOException e) {
// CipherInputStream doesn't like being closed without having read the
// entire stream, for some reason. But we don't want to because it's a waste
// of resources. We really, really don't care about this.
// However on close() CipherInputStream does throw this exception, wrapped
// in an IOException so we need to catch it.
if (!(e.getCause() instanceof javax.crypto.BadPaddingException)) {
throw e;
} }
} catch (IOException | UnsupportedFormatException e) { }
return spec;
} catch (IOException | UnsupportedFormatException | ArrayIndexOutOfBoundsException e) {
// If the format is not the right one for this file, the processor will throw one // If the format is not the right one for this file, the processor will throw one
// of these exceptions. In our case, that means we should try the next spec, // of these exceptions. In our case, that means we should try the next spec,
// since it may still be at another format we haven't tried yet. // since it may still be at another format we haven't tried yet.

View File

@ -20,6 +20,7 @@ public class CommandList {
public static void populate() { public static void populate() {
// TODO: Move some commands to native code. // TODO: Move some commands to native code.
Dicttool.addCommand("info", Info.class); Dicttool.addCommand("info", Info.class);
Dicttool.addCommand("header", Header.class);
Dicttool.addCommand("diff", Diff.class); Dicttool.addCommand("diff", Diff.class);
Dicttool.addCommand("compress", Compress.Compressor.class); Dicttool.addCommand("compress", Compress.Compressor.class);
Dicttool.addCommand("uncompress", Compress.Uncompressor.class); Dicttool.addCommand("uncompress", Compress.Uncompressor.class);

View File

@ -0,0 +1,69 @@
/**
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.android.inputmethod.latin.dicttool;
import com.android.inputmethod.latin.BinaryDictionary;
import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec;
import com.android.inputmethod.latin.makedict.DictionaryHeader;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import java.io.File;
import java.util.Arrays;
import java.util.Locale;
/**
 * Dicttool command that prints the header of a dictionary file without processing the
 * rest of the dictionary.
 *
 * Usage: {@code header [-p] <filename>}
 */
public class Header extends Dicttool.Command {
    public static final String COMMAND = "header";

    public Header() {
    }

    /**
     * @return a one-line usage string for this command, including the optional -p flag
     */
    @Override
    public String getHelp() {
        return COMMAND + " [-p] <filename>: prints the header contents of a dictionary file";
    }

    /**
     * Decodes the header of the file named in the arguments and prints its location,
     * size, packaging chain and header attributes to standard output.
     *
     * @throws RuntimeException if no file name was supplied
     * @throws UnsupportedFormatException if no decoder chain could read the file
     */
    @Override
    public void run() throws UnsupportedFormatException {
        // "-p" is forwarded as the `plumbing` argument of DictionaryOptions#toString.
        // NOTE(review): presumably this selects a script-friendly output — confirm.
        final boolean plumbing = mArgs.length > 0 && "-p".equals(mArgs[0]);
        // Index into mArgs instead of rewriting the field, so that calling run() again
        // on the same instance still sees the original arguments, flag included.
        final int filenameIndex = plumbing ? 1 : 0;
        if (mArgs.length <= filenameIndex) {
            throw new RuntimeException("Not enough arguments for command " + COMMAND);
        }
        final String filename = mArgs[filenameIndex];
        final File dictFile = new File(filename);
        final DecoderChainSpec<DictionaryHeader> spec =
                BinaryDictOffdeviceUtils.decodeDictionaryForProcess(dictFile,
                        new BinaryDictOffdeviceUtils.HeaderReaderProcessor());
        if (null == spec) {
            throw new UnsupportedFormatException(filename
                    + " doesn't seem to be a valid version 2 dictionary file");
        }

        final DictionaryHeader header = spec.mResult;
        System.out.println("Dictionary : " + dictFile.getAbsolutePath());
        System.out.println("Size : " + dictFile.length() + " bytes");
        System.out.println("Format : Binary dictionary format");
        System.out.println("Packaging : " + spec.describeChain());
        System.out.println("Header attributes :");
        System.out.print(header.mDictionaryOptions.toString(2 /* indentCount */, plumbing));
    }
}