Add a header command to dicttool.

This will greatly improve the performance of the metadata-generating files, as they won't have to wait for the info command to read the entire dictionary when the header is all we need. Also add tests and, while we're at it, use the seed as intended to enable reproducible tests.

Change-Id: I0ba79ef62f0292b23e63aed57ff565bb102281a2
2024-09-28 14:54:30 +01:00 · 2014-10-20 21:53:31 +09:00 · 2014-10-20 21:53:31 +09:00 · 5b33d197ba
commit 5b33d197ba
parent f6b0e32df3
4 changed files with 162 additions and 17 deletions
--- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
@ -24,8 +24,13 @@ import android.util.SparseArray;

 import com.android.inputmethod.latin.BinaryDictionary;
 import com.android.inputmethod.latin.common.CodePointUtils;
+import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils;
+import com.android.inputmethod.latin.dicttool.Compress;
+import com.android.inputmethod.latin.dicttool.Crypt;
+import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec;
 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
+import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
@ -67,6 +72,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
    private static final SparseArray<List<Integer>> sChainBigrams = new SparseArray<>();
    private static final HashMap<String, List<String>> sShortcuts = new HashMap<>();

+    final Random mRandom;
+
    public BinaryDictDecoderEncoderTests() {
        this(System.currentTimeMillis(), DEFAULT_MAX_UNIGRAMS);
    }
@ -75,10 +82,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
        super();
        BinaryDictionaryUtils.setCurrentTimeForTest(0);
        Log.e(TAG, "Testing dictionary: seed is " + seed);
-        final Random random = new Random(seed);
+        mRandom = new Random(seed);
        sWords.clear();
        sWordsWithVariousCodePoints.clear();
-        generateWords(maxUnigrams, random);
+        generateWords(maxUnigrams, mRandom);

        for (int i = 0; i < sWords.size(); ++i) {
            sChainBigrams.put(i, new ArrayList<Integer>());
@ -96,10 +103,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {

        sShortcuts.clear();
        for (int i = 0; i < NUM_OF_NODES_HAVING_SHORTCUTS; ++i) {
-            final int from = Math.abs(random.nextInt()) % sWords.size();
+            final int from = Math.abs(mRandom.nextInt()) % sWords.size();
            sShortcuts.put(sWords.get(from), new ArrayList<String>());
            for (int j = 0; j < NUM_OF_SHORTCUTS; ++j) {
-                final int to = Math.abs(random.nextInt()) % sWords.size();
+                final int to = Math.abs(mRandom.nextInt()) % sWords.size();
                sShortcuts.get(sWords.get(from)).add(sWords.get(to));
            }
        }
@ -604,11 +611,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
                + " : " + outputOptions(bufferType, formatOptions));

        // Test a word that isn't contained within the dictionary.
-        final Random random = new Random((int)System.currentTimeMillis());
        final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
-                random);
+                mRandom);
        for (int i = 0; i < 1000; ++i) {
-            final String word = CodePointUtils.generateWord(random, codePointSet);
+            final String word = CodePointUtils.generateWord(mRandom, codePointSet);
            if (sWords.indexOf(word) != -1) continue;
            checkGetTerminalPosition(dictDecoder, word, false);
        }
@ -731,4 +737,61 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
        assertTrue(wordSet.isEmpty());
        assertTrue(bigramSet.isEmpty());
    }
+
+    public void runTestHeaderReaderProcessorWithOneSpec(final boolean compress, final boolean crypt)
+                throws IOException {
+        final String dictName = "testHeaderReaderProcessor";
+        final String dictVersion = Long.toString(System.currentTimeMillis());
+        final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS;
+        final int MAX_NUMBER_OF_OPTIONS_TO_ADD = 5;
+        final HashMap<String, String> options = new HashMap<>();
+        // Required attributes
+        options.put("dictionary", "main:en_US");
+        options.put("locale", "en_US");
+        options.put("version", Integer.toString(mRandom.nextInt()));
+        // Add some random options for test
+        final int numberOfOptionsToAdd = mRandom.nextInt() % (MAX_NUMBER_OF_OPTIONS_TO_ADD + 1);
+        for (int i = 0; i < numberOfOptionsToAdd; ++i) {
+            options.put(sWordsWithVariousCodePoints.get(2 * i),
+                    sWordsWithVariousCodePoints.get(2 * 1 + 1));
+        }
+        final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
+                new DictionaryOptions(options));
+        addUnigrams(sWords.size(), dict, sWords, null);
+        File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions,
+                getContext().getCacheDir());
+        timeWritingDictToFile(file, dict, formatOptions);
+
+        if (compress) {
+            final File rawFile = file;
+            file = BinaryDictUtils.getDictFile(dictName + "compress", dictVersion, formatOptions,
+                    getContext().getCacheDir());
+            final Compress.Compressor compressCommand = new Compress.Compressor();
+            compressCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() });
+            compressCommand.run();
+        }
+        if (crypt) {
+            final File rawFile = file;
+            file = BinaryDictUtils.getDictFile(dictName + "crypt", dictVersion, formatOptions,
+                    getContext().getCacheDir());
+            final Crypt.Encrypter cryptCommand = new Crypt.Encrypter();
+            cryptCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() });
+            cryptCommand.run();
+        }
+
+        final DecoderChainSpec<DictionaryHeader> spec =
+                BinaryDictOffdeviceUtils.decodeDictionaryForProcess(file,
+                        new BinaryDictOffdeviceUtils.HeaderReaderProcessor());
+        assertNotNull("Can't decode a dictionary we just wrote : " + file, spec);
+        final DictionaryHeader header = spec.mResult;
+        assertEquals("raw" + (crypt ? " > encryption" : "") + (compress ? " > compression" : ""),
+                spec.describeChain());
+        assertEquals(header.mDictionaryOptions.mAttributes, options);
+    }
+
+    public void testHeaderReaderProcessor() throws IOException {
+        runTestHeaderReaderProcessorWithOneSpec(false /* compress */, false /* crypt */);
+        runTestHeaderReaderProcessorWithOneSpec(true /* compress */, false /* crypt */);
+        runTestHeaderReaderProcessorWithOneSpec(true /* compress */, true /* crypt */);
+    }
 }
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
@ -65,7 +65,7 @@ public final class BinaryDictOffdeviceUtils {
        };

        private final int mDecoderSpecIndex;
-        T mResult;
+        public T mResult;

        public DecoderChainSpec() {
            mDecoderSpecIndex = 0;
@ -174,12 +174,13 @@ public final class BinaryDictOffdeviceUtils {
            }
            final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8)
                    + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF);
-            if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201) {
-                throw new UnsupportedFormatException("Only versions 2 and 201 are supported");
+            if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201
+                    && version != FormatSpec.VERSION202) {
+                throw new UnsupportedFormatException("Only versions 2, 201, 202 are supported");
            }
-            final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) >> 24)
-                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) >> 16)
-                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) >> 8)
+            final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) << 24)
+                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) << 16)
+                    + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) << 8)
                    + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF);
            if (totalHeaderSize > MAX_HEADER_LENGTH) {
                throw new UnsupportedFormatException("Header too large");
@ -215,11 +216,22 @@ public final class BinaryDictOffdeviceUtils {
        @Nonnull DecoderChainSpec spec = new DecoderChainSpec();
        while (null != spec) {
            try {
-                try (final InputStream input = spec.getStream(src)) {
-                    spec.mResult = processor.process(input);
-                    return spec;
+                final InputStream input = spec.getStream(src);
+                spec.mResult = processor.process(input);
+                try {
+                    input.close();
+                } catch (IOException e) {
+                    // CipherInputStream doesn't like being closed without having read the
+                    // entire stream, for some reason. But we don't want to because it's a waste
+                    // of resources. We really, really don't care about this.
+                    // However on close() CipherInputStream does throw this exception, wrapped
+                    // in an IOException so we need to catch it.
+                    if (!(e.getCause() instanceof javax.crypto.BadPaddingException)) {
+                        throw e;
+                    }
                }
-            } catch (IOException | UnsupportedFormatException e) {
+                return spec;
+            } catch (IOException | UnsupportedFormatException | ArrayIndexOutOfBoundsException e) {
                // If the format is not the right one for this file, the processor will throw one
                // of these exceptions. In our case, that means we should try the next spec,
                // since it may still be at another format we haven't tried yet.
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java
@ -20,6 +20,7 @@ public class CommandList {
    public static void populate() {
        // TODO: Move some commands to native code.
        Dicttool.addCommand("info", Info.class);
+        Dicttool.addCommand("header", Header.class);
        Dicttool.addCommand("diff", Diff.class);
        Dicttool.addCommand("compress", Compress.Compressor.class);
        Dicttool.addCommand("uncompress", Compress.Uncompressor.class);
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Header.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Header.java
@ -0,0 +1,69 @@
+/**
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.android.inputmethod.latin.dicttool;
+
+import com.android.inputmethod.latin.BinaryDictionary;
+import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec;
+import com.android.inputmethod.latin.makedict.DictionaryHeader;
+import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Locale;
+
+/**
+ * Dicttool command that decodes a dictionary file with
+ * {@link BinaryDictOffdeviceUtils.HeaderReaderProcessor} and prints the file location, size,
+ * packaging chain and header attributes to standard output.
+ */
+public class Header extends Dicttool.Command {
+    public static final String COMMAND = "header";
+
+    public Header() {
+    }
+
+    /**
+     * @return the one-line usage string for this command, including the optional -p flag
+     */
+    @Override
+    public String getHelp() {
+        return COMMAND + " [-p] <filename>: prints the header contents of a dictionary file"
+                + " (-p selects plumbing output)";
+    }
+
+    /**
+     * Prints the header of the dictionary file named by the command arguments.
+     *
+     * The arguments are an optional leading "-p" followed by the file name.
+     *
+     * @throws RuntimeException if no file name argument was supplied
+     * @throws UnsupportedFormatException if no decoder chain could be found for the file
+     */
+    @Override
+    public void run() throws UnsupportedFormatException {
+        // An optional leading "-p" is forwarded as the plumbing flag to the attribute printer.
+        // The arguments are read by index rather than by rewriting mArgs, so that running the
+        // command again on the same instance sees the same arguments.
+        final boolean plumbing = mArgs.length > 0 && "-p".equals(mArgs[0]);
+        final int filenameIndex = plumbing ? 1 : 0;
+        if (mArgs.length <= filenameIndex) {
+            throw new RuntimeException("Not enough arguments for command " + COMMAND);
+        }
+        final String filename = mArgs[filenameIndex];
+        final File dictFile = new File(filename);
+        final DecoderChainSpec<DictionaryHeader> spec =
+                BinaryDictOffdeviceUtils.decodeDictionaryForProcess(dictFile,
+                        new BinaryDictOffdeviceUtils.HeaderReaderProcessor());
+        // A null spec means none of the decoder chains that were tried could read the file.
+        if (null == spec) {
+            throw new UnsupportedFormatException(filename
+                    + " doesn't seem to be a valid version 2 dictionary file");
+        }
+
+        final DictionaryHeader header = spec.mResult;
+        System.out.println("Dictionary : " + dictFile.getAbsolutePath());
+        System.out.println("Size : " + dictFile.length() + " bytes");
+        System.out.println("Format : Binary dictionary format");
+        System.out.println("Packaging : " + spec.describeChain());
+        System.out.println("Header attributes :");
+        // print() rather than println(): no extra newline is appended after the attributes.
+        System.out.print(header.mDictionaryOptions.toString(2 /* indentCount */, plumbing));
+    }
+}