Skip to content
34 changes: 28 additions & 6 deletions src/main/java/org/embulk/input/gcs/GcsFileInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,28 +91,50 @@ static FileList listFiles(final PluginTask task) {

// String nextToken = base64Encode(0x0a + ASCII character according to utf8EncodeLength position+ filePath);
static String base64Encode(final String path) {
byte[] lengthVarint;
byte[] encoding;
byte[] utf8 = path.getBytes(StandardCharsets.UTF_8);
LOG.debug("path string: {} ,path length:{} \" + ", path, utf8.length);

int utf8EncodeLength = utf8.length;
if (utf8EncodeLength >= 128) {
if (utf8EncodeLength >= 65_535) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's understandable that this base64Encode method can handle 65536 bytes at most by itself (regardless of the limitation for task.getLastPath()), but it is a little bit confusing for code readers that it has a different limitation from task.getLastPath().

Can you align this with task.getLastPath() and leave a comment to explain why limited by that number?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aligned byte limit to 1024 bytes and added a comment.
db7e7b9

throw new ConfigException(String.format("last_path '%s' is too long to encode. Please try to reduce its length", path));
}

encoding = new byte[utf8.length + 2];
lengthVarint = encodeVarint(utf8EncodeLength);
encoding = new byte[1 + lengthVarint.length + utf8.length];
encoding[0] = 0x0a;

// for example: 60 -> '<'
char temp = (char) utf8EncodeLength;
encoding[1] = (byte) temp;
System.arraycopy(utf8, 0, encoding, 2, utf8.length);
System.arraycopy(lengthVarint, 0, encoding, 1, lengthVarint.length);
System.arraycopy(utf8, 0, encoding, 1 + lengthVarint.length, utf8.length);

final String s = Base64.getEncoder().encodeToString(encoding);
LOG.debug("last_path(base64 encoded): {}", s);
return s;
}

// see: https://protobuf.dev/programming-guides/encoding/#varints
private static byte[] encodeVarint(int value)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that this method is indirectly tested via base64Encode, but for sure and for readability, can you add some more tests directly for this encodeVarint method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests were added. I use reflect to test private methods, but please point out if this should not be used as repository policy.
9f691e6

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your consideration! I think that the method can be just package-private (no visibility modifier) so that the method is visible from test classes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed to package-private. Thanks for the comment!
dea553b

{
// utf8EncodeLength.length is up to 65535, so 2 bytes are enough for buffer
byte[] buffer = new byte[2];
int pos = 0;
while (true) {
int bits = value & 0x7F;
value >>>= 7;
if (value != 0) {
buffer[pos++] = (byte) (bits | 0x80);
}
else {
buffer[pos++] = (byte) bits;
break;
}
}
byte[] result = new byte[pos];
System.arraycopy(buffer, 0, result, 0, pos);
return result;
}

private static void printBucketInfo(final Storage client, final String bucket) {
// get Bucket
Storage.BucketGetOption fields = Storage.BucketGetOption.fields(
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/embulk/input/gcs/GcsFileInputPlugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.embulk.input.gcs;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
import org.embulk.config.ConfigDiff;
Expand Down Expand Up @@ -64,10 +65,10 @@ public ConfigDiff transaction(final ConfigSource config, final FileInputPlugin.C
}
}

// @see https://cloud.google.com/storage/docs/bucket-naming
// @see https://cloud.google.com/storage/docs/objects#naming
if (task.getLastPath().isPresent()) {
if (task.getLastPath().get().length() >= 128) {
throw new ConfigException("last_path length is allowed up to 127 characters");
if (task.getLastPath().get().getBytes(StandardCharsets.UTF_8).length >= 1025) {
throw new ConfigException("last_path length is allowed up to 1024 bytes.");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,8 @@ public void testBase64() {
assertEquals("CgJjMg==", GcsFileInput.base64Encode("c2"));
assertEquals("Cgh0ZXN0LmNzdg==", GcsFileInput.base64Encode("test.csv"));
assertEquals("ChZnY3MtdGVzdC9zYW1wbGVfMDEuY3N2", GcsFileInput.base64Encode("gcs-test/sample_01.csv"));
String params = "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc127";
String expected = "Cn9jY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjMTI3";
String params = "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc200";
String expected = "CsgBY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2MyMDA=";
assertEquals(expected, GcsFileInput.base64Encode(params));

params = "テストダミー/テス123/テストダミー/テストダミ.csv";
Expand Down