diff --git a/src/changes/changes.xml b/src/changes/changes.xml index ce6e8d66cd..24a8e29d1f 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -51,6 +51,7 @@ The type attribute can be add,update,fix,remove. Add Base58 support. Add BaseNCodecInputStream.AbstracBuilder.setByteArray(byte[]). + Add DigestUtils.gitBlob() and DigestUtils.gitTree() to compute Git blob and tree object identifiers. Bump org.apache.commons:commons-parent from 96 to 97. diff --git a/src/main/java/org/apache/commons/codec/digest/DigestUtils.java b/src/main/java/org/apache/commons/codec/digest/DigestUtils.java index 786cc4e5fa..2b5f7cdbd1 100644 --- a/src/main/java/org/apache/commons/codec/digest/DigestUtils.java +++ b/src/main/java/org/apache/commons/codec/digest/DigestUtils.java @@ -18,17 +18,24 @@ package org.apache.commons.codec.digest; import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.OpenOption; import java.nio.file.Path; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.TreeSet; import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.StringUtils; @@ -139,6 +146,134 @@ public static byte[] digest(final MessageDigest messageDigest, final RandomAcces return updateDigest(messageDigest, data).digest(); } + /** + * Reads through a byte array and return a generalized Git blob identifier + * + *

The identifier is computed in the way described by the + * SWHID contents identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to the Git blob identifier and the SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @return A generalized Git blob identifier. + * @since 1.22.0 + */ + public static byte[] gitBlob(final MessageDigest messageDigest, final byte[] data) { + messageDigest.reset(); + updateDigest(messageDigest, gitBlobPrefix(data.length)); + return digest(messageDigest, data); + } + + /** + * Reads through a byte array and return a generalized Git blob identifier + * + *

The identifier is computed in the way described by the + * SWHID contents identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to the Git blob identifier and the SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @param options Options how to open the file + * @return A generalized Git blob identifier. + * @throws IOException On error accessing the file + * @since 1.22.0 + */ + public static byte[] gitBlob(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { + messageDigest.reset(); + updateDigest(messageDigest, gitBlobPrefix(Files.size(data))); + return updateDigest(messageDigest, data, options).digest(); + } + + private static byte[] gitBlobPrefix(final long dataSize) { + return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + /** + * Returns a generalized Git tree identifier + * + *

The identifier is computed in the way described by the + * SWHID directory identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to the Git tree identifier and the SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1) + * @param entries The directory entries + * @return A generalized Git tree identifier. + */ + static byte[] gitTree(final MessageDigest messageDigest, final Collection entries) { + final TreeSet treeSet = new TreeSet<>(entries); + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final GitDirectoryEntry entry : treeSet) { + final byte[] treeEntryBytes = entry.toTreeEntryBytes(); + baos.write(treeEntryBytes, 0, treeEntryBytes.length); + } + messageDigest.reset(); + updateDigest(messageDigest, gitTreePrefix(baos.size())); + return updateDigest(messageDigest, baos.toByteArray()).digest(); + } + + /** + * Reads through a byte array and return a generalized Git tree identifier + * + *

The identifier is computed in the way described by the + * SWHID directory identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to the Git tree identifier and the SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @param options Options how to open the file + * @return A generalized Git tree identifier. + * @throws IOException On error accessing the file + * @since 1.22.0 + */ + public static byte[] gitTree(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { + final List entries = new ArrayList<>(); + try (DirectoryStream files = Files.newDirectoryStream(data)) { + for (final Path path : files) { + final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path); + final byte[] rawObjectId; + if (type == GitDirectoryEntry.Type.DIRECTORY) { + rawObjectId = gitTree(messageDigest, path, options); + } else { + rawObjectId = gitBlob(messageDigest, path, options); + } + entries.add(new GitDirectoryEntry(path, type, rawObjectId)); + } + } + return gitTree(messageDigest, entries); + } + + /** + * Returns the {@link GitDirectoryEntry.Type} of a file. + * + * @param path The file to check. + * @return A {@link GitDirectoryEntry.Type} + */ + private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) { + // Symbolic links first + if (Files.isSymbolicLink(path)) { + return GitDirectoryEntry.Type.SYMBOLIC_LINK; + } + if (Files.isDirectory(path)) { + return GitDirectoryEntry.Type.DIRECTORY; + } + if (Files.isExecutable(path)) { + return GitDirectoryEntry.Type.EXECUTABLE; + } + return GitDirectoryEntry.Type.REGULAR; + } + + private static byte[] gitTreePrefix(final long dataSize) { + return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + /** * Gets a {@code MessageDigest} for the given {@code algorithm}. 
* diff --git a/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java b/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java new file mode 100644 index 0000000000..e1073611fc --- /dev/null +++ b/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Objects; + +/** + * Represents a single entry in a Git tree object. + * + *

A Git tree object encodes a directory snapshot. Each entry holds:

+ *
    + *
  • a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),
  • + *
  • the entry name (file or directory name, without a path separator),
  • + *
  • the raw object id of the referenced blob or sub-tree.
  • + *
+ * + *

Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} + * sorts after {@code foo.txt} but before {@code foobar}.

+ * + *

Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.

+ * + * @see Git Internals – Git Objects + * @see SWHID Directory Identifier + */ +class GitDirectoryEntry implements Comparable { + + /** + * The entry name (file or directory name, no path separator). + */ + private final String name; + + /** + * The key used for ordering entries within a tree object. + * + *

Git appends {@code '/'} to directory names before comparing.

+ */ + private final String sortKey; + + /** + * The Git object type, which determines the Unix file-mode prefix. + */ + private final Type type; + + /** + * The raw object id of the referenced blob or sub-tree. + */ + private final byte[] rawObjectId; + + private static String getFileName(final Path path) { + final Path fileName = path.getFileName(); + if (fileName == null) { + throw new IllegalArgumentException(path.toString()); + } + return fileName.toString(); + } + + /** + * Creates an entry + * + * @param name The name of the entry + * @param type The type of the entry + * @param rawObjectId The id of the entry + */ + private GitDirectoryEntry(final String name, final Type type, final byte[] rawObjectId) { + this.name = name; + this.type = type; + this.sortKey = type == Type.DIRECTORY ? name + "/" : name; + this.rawObjectId = rawObjectId; + } + + /** + * Creates an entry + * + * @param path The path of the entry; must not be an empty path + * @param type The type of the entry + * @param rawObjectId The id of the entry + * @throws IllegalArgumentException If the path is empty + * @throws NullPointerException If any argument is {@code null} + */ + GitDirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) { + this(getFileName(path), Objects.requireNonNull(type), Objects.requireNonNull(rawObjectId)); + } + + /** + * Returns the binary encoding of this entry as it appears inside a Git tree object. + * + *

The format follows the Git tree entry layout; the object id is written as raw bytes whose length is that of the digest (20 bytes for SHA-1):

+ *
+     *   <mode> SP <name> NUL <20-byte-object-id>
+     * 
+ * + * @return the binary tree-entry encoding; never {@code null} + */ + byte[] toTreeEntryBytes() { + final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2]; + System.arraycopy(type.mode, 0, result, 0, type.mode.length); + result[type.mode.length] = ' '; + System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length); + result[type.mode.length + nameBytes.length + 1] = '\0'; + System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length); + return result; + } + + @Override + public int compareTo(GitDirectoryEntry o) { + return sortKey.compareTo(o.sortKey); + } + + @Override + public int hashCode() { + return name.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof GitDirectoryEntry)) { + return false; + } + final GitDirectoryEntry other = (GitDirectoryEntry) obj; + return name.equals(other.name); + } + + /** + * The type of a Git tree entry, which maps to a Unix file-mode string. + * + *

Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here + * cover the four entry types that can be derived from a plain directory listing; Git additionally uses {@code 160000} for submodule (gitlink) entries.

+ * + *

This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.

+ */ + enum Type { + + /** + * A sub-directory (Git sub-tree) + */ + DIRECTORY("40000"), + + /** + * An executable file + */ + EXECUTABLE("100755"), + + /** + * A regular (non-executable) file + */ + REGULAR("100644"), + + /** + * A symbolic link + */ + SYMBOLIC_LINK("120000"); + + /** + * The ASCII-encoded octal mode string as it appears in the binary tree entry. + */ + private final byte[] mode; + + Type(final String mode) { + this.mode = mode.getBytes(StandardCharsets.US_ASCII); + } + } +} diff --git a/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java b/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java index b27705b5d8..01fcce06a8 100644 --- a/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java +++ b/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java @@ -32,11 +32,14 @@ import java.io.OutputStream; import java.io.RandomAccessFile; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.security.MessageDigest; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Locale; import java.util.Random; import java.util.stream.Stream; @@ -52,6 +55,7 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; /** * Tests {@link DigestUtils}. 
@@ -238,6 +242,16 @@ class DigestUtilsTest { "CA 92 BF 0B E5 61 5E 96 95 9D 76 71 97 A0 BE EB"; // @formatter:on + static Stream gitBlobProvider() { + return Stream.of(Arguments.of("DigestUtilsTest/hello.txt", "5f4a83288e67f1be2d6fcdad84165a86c6a970d7"), + Arguments.of("DigestUtilsTest/greetings.txt", "6cf4f797455661e61d1ee6913fc29344f5897243"), + Arguments.of("DigestUtilsTest/subdir/nested.txt", "07a392ddb4dbff06a373a7617939f30b2dcfe719")); + } + + private static Path resourcePath(final String resourceName) throws Exception { + return Paths.get(DigestUtilsTest.class.getClassLoader().getResource(resourceName).toURI()); + } + static Stream testShake128_256() { // @formatter:off return Stream.of( @@ -475,6 +489,64 @@ void testGetMessageDigest() { assertEquals(MessageDigestAlgorithms.MD5, digestUtils.getMessageDigest().getAlgorithm()); } + @ParameterizedTest + @MethodSource("gitBlobProvider") + void testGitBlobByteArray(final String resourceName, final String expectedSha1Hex) throws Exception { + final byte[] data = Files.readAllBytes(resourcePath(resourceName)); + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), DigestUtils.gitBlob(DigestUtils.getSha1Digest(), data)); + } + + @ParameterizedTest + @MethodSource("gitBlobProvider") + void testGitBlobPath(final String resourceName, final String expectedSha1Hex) throws Exception { + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), DigestUtils.gitBlob(DigestUtils.getSha1Digest(), resourcePath(resourceName))); + } + + /** + * Binary body of the test tree object used in {@link #testGitTreeCollection}. + * + *

Each entry has the format {@code <mode> SP <name> NUL <20-byte-object-id>}.

+ */ + private static final String TREE_BODY_HEX = + // 100644 hello.txt\0 + objectId + "3130303634342068656c6c6f2e74787400" + "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0" + + // 120000 link.txt\0 + objectId + "313230303030206c696e6b2e74787400" + "1234567890abcdef1234567890abcdef12345678" + + // 100755 run.sh\0 + objectId + "3130303735352072756e2e736800" + "f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9" + + // 40000 src\0 + objectId + "34303030302073726300" + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"; + + @ParameterizedTest + @ValueSource(strings = {MessageDigestAlgorithms.SHA_1, MessageDigestAlgorithms.SHA_256}) + void testGitTreeCollection(final String algorithm) throws Exception { + final byte[] helloId = Hex.decodeHex("a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"); + final byte[] runId = Hex.decodeHex("f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9"); + final byte[] linkId = Hex.decodeHex("1234567890abcdef1234567890abcdef12345678"); + final byte[] srcId = Hex.decodeHex("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); + + // Entries are supplied out of order to verify that the method sorts them correctly. 
+ final List entries = new ArrayList<>(); + entries.add(new GitDirectoryEntry(Paths.get("src"), GitDirectoryEntry.Type.DIRECTORY, srcId)); + entries.add(new GitDirectoryEntry(Paths.get("run.sh"), GitDirectoryEntry.Type.EXECUTABLE, runId)); + entries.add(new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, helloId)); + entries.add(new GitDirectoryEntry(Paths.get("link.txt"), GitDirectoryEntry.Type.SYMBOLIC_LINK, linkId)); + + // Compute expected value + final byte[] treeBody = Hex.decodeHex(TREE_BODY_HEX); + final MessageDigest md = DigestUtils.getDigest(algorithm); + DigestUtils.updateDigest(md, ("tree " + treeBody.length + "\0").getBytes(StandardCharsets.UTF_8)); + final byte[] expected = DigestUtils.updateDigest(md, treeBody).digest(); + + assertArrayEquals(expected, DigestUtils.gitTree(md, entries)); + } + + @Test + void testGitTreePath() throws Exception { + assertArrayEquals(Hex.decodeHex("e4b21f6d78ceba6eb7c211ac15e3337ec4614e8a"), + DigestUtils.gitTree(DigestUtils.getSha1Digest(), resourcePath("DigestUtilsTest"))); + } + @Test void testInternalNoSuchAlgorithmException() { assertThrows(IllegalArgumentException.class, () -> DigestUtils.getDigest("Bogus Bogus")); diff --git a/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java b/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java new file mode 100644 index 0000000000..ce37c0e1ef --- /dev/null +++ b/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; + +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; + +class GitDirectoryEntryTest { + + private static final byte[] ZERO_ID = new byte[20]; + + /** + * The Path constructor must extract the filename component. + */ + @Test + void testPathConstructorUsesFilename() { + final GitDirectoryEntry fromLabel = new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry fromRelative = new GitDirectoryEntry(Paths.get("subdir/hello.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry fromAbsolute = new GitDirectoryEntry(Paths.get("hello.txt").toAbsolutePath(), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + + assertEquals(fromLabel, fromRelative); + assertEquals(fromLabel, fromAbsolute); + assertArrayEquals(fromLabel.toTreeEntryBytes(), fromRelative.toTreeEntryBytes()); + assertArrayEquals(fromLabel.toTreeEntryBytes(), fromAbsolute.toTreeEntryBytes()); + } + + /** + * Equality and hash code are based solely on the entry name. 
+ */ + @Test + void testEqualityBasedOnNameOnly() { + final byte[] otherId = new byte[20]; + Arrays.fill(otherId, (byte) 0xff); + + final GitDirectoryEntry regular = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry executable = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.EXECUTABLE, otherId); + + // Same name, different type and object id -> equal + assertEquals(regular, executable); + assertEquals(regular.hashCode(), executable.hashCode()); + + // Different name -> not equal + assertNotEquals(regular, new GitDirectoryEntry(Paths.get("bar"), GitDirectoryEntry.Type.REGULAR, ZERO_ID)); + + // Same reference -> equal + assertEquals(regular, regular); + + // Not equal to null or unrelated type + assertNotEquals(regular, null); + assertNotEquals(regular, "foo"); + } + + /** + * Entries should be sorted by Git sort rule. + * + *

Git compares the names of the entries, but adds a {@code /} at the end of directory entries.

+ */ + @Test + void testSortOrder() { + final GitDirectoryEntry alpha = new GitDirectoryEntry(Paths.get("alpha.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry fooTxt = new GitDirectoryEntry(Paths.get("foo.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry fooDir = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.DIRECTORY, ZERO_ID); + final GitDirectoryEntry foobar = new GitDirectoryEntry(Paths.get("foobar"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + final GitDirectoryEntry zeta = new GitDirectoryEntry(Paths.get("zeta.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); + + final List entries = new ArrayList<>(Arrays.asList(zeta, foobar, fooDir, alpha, fooTxt)); + entries.sort(GitDirectoryEntry::compareTo); + + assertEquals(Arrays.asList(alpha, fooTxt, fooDir, foobar, zeta), entries); + } +} diff --git a/src/test/resources/DigestUtilsTest/greetings.txt b/src/test/resources/DigestUtilsTest/greetings.txt new file mode 100644 index 0000000000..6cf4f79745 --- /dev/null +++ b/src/test/resources/DigestUtilsTest/greetings.txt @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: Apache-2.0 +Greetings! diff --git a/src/test/resources/DigestUtilsTest/hello.txt b/src/test/resources/DigestUtilsTest/hello.txt new file mode 100644 index 0000000000..5f4a83288e --- /dev/null +++ b/src/test/resources/DigestUtilsTest/hello.txt @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: Apache-2.0 +Hello, World! diff --git a/src/test/resources/DigestUtilsTest/subdir/nested.txt b/src/test/resources/DigestUtilsTest/subdir/nested.txt new file mode 100644 index 0000000000..07a392ddb4 --- /dev/null +++ b/src/test/resources/DigestUtilsTest/subdir/nested.txt @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: Apache-2.0 +Nested file.