Skip to content

Commit 6b4e415

Browse files
authored
fix utf8 issues in jpostal (#10)
* fix utf8 issues in jpostal
* bump version
1 parent 9023e7f commit 6b4e415

File tree

8 files changed

+116
-34
lines changed

8 files changed

+116
-34
lines changed

build.gradle

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,20 @@ plugins {
77
id 'maven-publish'
88
}
99

10-
// The latest release is 1.1.0, but there are many unreleased changes in the master branch
11-
version = '1.2.0'
10+
// The latest libpostal release is 1.1.0, but there are many unreleased changes in the master branch
11+
version = '1.2.1'
1212

1313
repositories {
1414
mavenCentral()
1515
}
1616

17+
tasks.withType(JavaCompile) {
18+
options.encoding = 'UTF-8'
19+
}
20+
test {
21+
systemProperty 'file.encoding', 'UTF-8'
22+
}
23+
1724
dependencies {
1825
testImplementation 'junit:junit:4.+'
1926
implementation 'org.apache.commons:commons-compress:1.26.2'

src/jpostal/c/jpostal_AddressExpander.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,17 @@ JNIEXPORT void JNICALL Java_com_mapzen_jpostal_AddressExpander_setupDataDir
2626
}
2727

2828
JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressExpander_libpostalExpand
29-
(JNIEnv *env, jclass cls, jstring jAddress, jobject jOptions) {
30-
const char *address = (*env)->GetStringUTFChars(env, jAddress, 0);
29+
(JNIEnv *env, jclass cls, jbyteArray jAddress, jobject jOptions) {
30+
jbyte* addressElements = (*env)->GetByteArrayElements(env, jAddress, NULL);
31+
jsize size = (*env)->GetArrayLength(env, jAddress);
32+
char address[size + 1];
33+
34+
for (size_t z = 0; z < size; z++) {
35+
address[z] = addressElements[z];
36+
}
37+
(*env) -> ReleaseByteArrayElements(env, jAddress, addressElements, 0);
38+
39+
address[size] = '\0';
3140

3241
size_t num_expansions = 0;
3342
libpostal_normalize_options_t options = libpostal_get_default_options();
@@ -210,16 +219,17 @@ JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressExpander_libpostal
210219

211220
char **expansions = libpostal_expand_address((char *)address, options, &num_expansions);
212221

213-
(*env)->ReleaseStringUTFChars(env, jAddress, address);
214222

215223
jobjectArray ret = (jobjectArray)(*env)->NewObjectArray(env,
216224
num_expansions,
217-
(*env)->FindClass(env, "java/lang/String"),
225+
(*env)->FindClass(env, "[B"),
218226
(*env)->NewStringUTF(env, ""));
219227

220228
if (num_expansions > 0) {
221229
for (size_t i = 0; i < num_expansions; i++) {
222-
(*env)->SetObjectArrayElement(env, ret, i, (*env)->NewStringUTF(env, expansions[i]));
230+
jbyteArray bytes = (*env)->NewByteArray(env,strlen(expansions[i]));
231+
(*env)->SetByteArrayRegion(env, bytes, 0, strlen(expansions[i]), (jbyte*) expansions[i]);
232+
(*env)->SetObjectArrayElement(env, ret, i, bytes);
223233
}
224234

225235
}
@@ -409,4 +419,4 @@ JNIEXPORT void JNICALL Java_com_mapzen_jpostal_ExpanderOptions_00024Builder_setD
409419
}
410420

411421
(*env)->SetBooleanField(env, builder, fid, default_options.roman_numerals);
412-
}
422+
}

src/jpostal/c/jpostal_AddressParser.c

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <jni.h>
22
#include <libpostal/libpostal.h>
3+
#include <string.h>
34

45
JNIEXPORT void JNICALL Java_com_mapzen_jpostal_AddressParser_setup
56
(JNIEnv *env, jclass cls) {
@@ -24,9 +25,19 @@ JNIEXPORT void JNICALL Java_com_mapzen_jpostal_AddressParser_setupDataDir
2425
}
2526

2627
JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressParser_libpostalParse
27-
(JNIEnv *env, jobject thisObj, jstring jAddress, jobject jOptions) {
28+
(JNIEnv *env, jobject thisObj, jbyteArray jAddress, jobject jOptions) {
29+
30+
jbyte* addressElements = (*env)->GetByteArrayElements(env, jAddress, NULL);
31+
jsize size = (*env)->GetArrayLength(env, jAddress);
32+
char address[size + 1];
33+
34+
for (int i = 0; i < size; ++i) {
35+
address[i] = addressElements[i];
36+
}
37+
(*env) -> ReleaseByteArrayElements(env, jAddress, addressElements, 0);
38+
39+
address[size] = '\0';
2840

29-
const char *address = (*env)->GetStringUTFChars(env, jAddress, 0);
3041

3142
libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
3243

@@ -58,8 +69,6 @@ JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressParser_libpostalPa
5869

5970
libpostal_address_parser_response_t *response = libpostal_parse_address((char *)address, options);
6071

61-
(*env)->ReleaseStringUTFChars(env, jAddress, address);
62-
6372
if (jLanguage != NULL) {
6473
(*env)->ReleaseStringUTFChars(env, jLanguage, 0);
6574
}
@@ -71,7 +80,7 @@ JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressParser_libpostalPa
7180
jmethodID mid;
7281

7382
jclass parsedComponentClass = (*env)->FindClass(env, "com/mapzen/jpostal/ParsedComponent");
74-
mid = (*env)->GetMethodID(env, parsedComponentClass, "<init>", "(Ljava/lang/String;Ljava/lang/String;)V");
83+
mid = (*env)->GetMethodID(env, parsedComponentClass, "<init>", "([BLjava/lang/String;)V");
7584

7685
size_t num_components = response != NULL ? response->num_components : 0;
7786

@@ -82,12 +91,16 @@ JNIEXPORT jobjectArray JNICALL Java_com_mapzen_jpostal_AddressParser_libpostalPa
8291

8392
if (num_components > 0) {
8493
for (size_t i = 0; i < num_components; i++) {
85-
jstring jComponent = (*env)->NewStringUTF(env, response->components[i]);
8694
jstring jLabel = (*env)->NewStringUTF(env, response->labels[i]);
87-
88-
jobject jParsedComponent = (*env)->NewObject(env, parsedComponentClass, mid, jComponent, jLabel);
89-
95+
jbyteArray bytes = (*env)->NewByteArray(env,strlen(response->components[i]));
96+
(*env)->SetByteArrayRegion(env, bytes, 0, strlen(response->components[i]), (jbyte*) response->components[i]);
97+
jobject jParsedComponent = (*env)->NewObject(env, parsedComponentClass, mid, bytes, jLabel);
9098
(*env)->SetObjectArrayElement(env, ret, i, jParsedComponent);
99+
100+
// These may be necessary to ensure we do not leak JNI local references (and thus memory) when running in the cluster:
101+
// (*env)->DeleteLocalRef(env, bytes);
102+
// (*env)->DeleteLocalRef(env, jLabel);
103+
// (*env)->DeleteLocalRef(env, jParsedComponent);
91104
}
92105
}
93106

src/main/java/com/mapzen/jpostal/AddressExpander.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,15 @@
22

33
import com.mapzen.jpostal.ExpanderOptions;
44

5+
import java.nio.charset.StandardCharsets;
6+
57
public class AddressExpander {
8+
9+
private static native synchronized void setup();
10+
private static native synchronized void setupDataDir(String dataDir);
11+
private static native synchronized byte[][] libpostalExpand(byte[] address, ExpanderOptions options);
12+
private static native synchronized void teardown();
13+
614
private volatile static AddressExpander instance = null;
715

816
private final LibPostal libPostal;
@@ -13,7 +21,7 @@ public static AddressExpander getInstanceDataDir(String dataDir) {
1321

1422
public static AddressExpander getInstanceConfig(Config config) {
1523
if (instance == null) {
16-
synchronized(AddressParser.class) {
24+
synchronized(AddressExpander.class) {
1725
if (instance == null) {
1826
instance = new AddressExpander(LibPostal.getInstance(config));
1927
}
@@ -32,11 +40,6 @@ public static boolean isInitialized() {
3240
return instance != null;
3341
}
3442

35-
private static native void setup();
36-
private static native void setupDataDir(String dataDir);
37-
private static native String[] libpostalExpand(String address, ExpanderOptions options);
38-
private static native void teardown();
39-
4043
public String[] expandAddress(String address) {
4144
return expandAddressWithOptions(address, new ExpanderOptions.Builder().build());
4245
}
@@ -49,9 +52,12 @@ public String[] expandAddressWithOptions(String address, ExpanderOptions options
4952
throw new NullPointerException("ExpanderOptions options must not be null");
5053
}
5154

52-
synchronized(AddressExpander.class) {
53-
return libpostalExpand(address, options);
55+
byte[][] expansionBytes = libpostalExpand(address.getBytes(), options);
56+
String[] expansions = new String[expansionBytes.length];
57+
for (int i = 0; i < expansionBytes.length; i++) {
58+
expansions[i] = new String(expansionBytes[i], StandardCharsets.UTF_8);
5459
}
60+
return expansions;
5561
}
5662

5763
AddressExpander(final LibPostal libPostal) {

src/main/java/com/mapzen/jpostal/AddressParser.java

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
import com.mapzen.jpostal.ParserOptions;
55

66
public class AddressParser {
7-
private static native void setup();
8-
private static native void setupDataDir(String dataDir);
9-
private native ParsedComponent[] libpostalParse(String address, ParserOptions options);
10-
private static native void teardown();
7+
8+
private static native synchronized void setup();
9+
private static native synchronized void setupDataDir(String dataDir);
10+
private native synchronized ParsedComponent[] libpostalParse(byte[] address, ParserOptions options);
11+
private static native synchronized void teardown();
1112

1213
private volatile static AddressParser instance = null;
1314

@@ -49,10 +50,7 @@ public ParsedComponent[] parseAddressWithOptions(String address, ParserOptions o
4950
if (options == null) {
5051
throw new NullPointerException("ParserOptions options must not be null");
5152
}
52-
53-
synchronized(AddressParser.class) {
54-
return libpostalParse(address, options);
55-
}
53+
return libpostalParse(address.getBytes(), options);
5654
}
5755

5856
AddressParser(final LibPostal libPostal) {

src/main/java/com/mapzen/jpostal/ParsedComponent.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package com.mapzen.jpostal;
22

3+
import java.nio.charset.StandardCharsets;
4+
35
public class ParsedComponent {
46
private String value;
57
private String label;
@@ -24,4 +26,9 @@ public ParsedComponent(String value, String label) {
2426
this.value = value;
2527
this.label = label;
2628
}
29+
30+
public ParsedComponent(byte[] value, String label) {
31+
this.value = new String(value, StandardCharsets.UTF_8);
32+
this.label = label;
33+
}
2734
}

src/test/java/com/mapzen/jpostal/TestAddressExpander.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,19 @@ public void testConfigMismatchLibraryFile() {
9393
);
9494
assertNull(thrown.getCause());
9595
}
96+
97+
@Test()
98+
public void testNulTerminatedExpansion() {
99+
assertTrue(containsExpansion("123 Main St\u0000", "123 main street"));
100+
}
101+
102+
@Test()
103+
public void testAltNulTerminatedExpansion() {
104+
assertTrue(containsExpansion("123 Main St\0", "123 main street"));
105+
}
106+
107+
@Test()
108+
public void test4ByteCharacterExpansion() {
109+
assertTrue(containsExpansion("123 Main St, 𠜎𠜱𠝹𠱓, 😀🤠", "123 main street 𠜎𠜱𠝹𠱓 😀🤠"));
110+
}
96111
}

src/test/java/com/mapzen/jpostal/TestAddressParser.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ public void testParseUSAddress() {
5656
);
5757
}
5858

59-
6059
@Test()
6160
public void testConfigMismatchDataDir() {
6261
AddressParser.getInstance();
@@ -86,4 +85,31 @@ public void testConfigMismatchLibraryFile() {
8685
);
8786
assertNull(thrown.getCause());
8887
}
88+
89+
@Test
90+
public void testParseNulTerminatedAddress() {
91+
testParse("Rue du Médecin-Colonel Calbairac Toulouse France\u0000",
92+
new ParsedComponent("rue du médecin-colonel calbairac", "road"),
93+
new ParsedComponent("toulouse", "city"),
94+
new ParsedComponent("france", "country")
95+
);
96+
}
97+
98+
@Test
99+
public void testParseAltNulTerminatedAddress() {
100+
testParse("Rue du Médecin-Colonel Calbairac Toulouse France\0",
101+
new ParsedComponent("rue du médecin-colonel calbairac", "road"),
102+
new ParsedComponent("toulouse", "city"),
103+
new ParsedComponent("france", "country")
104+
);
105+
}
106+
107+
@Test
108+
public void testParse4ByteCharacterAddress() {
109+
testParse("𠜎𠜱𠝹𠱓, 😀🤠, London, UK",
110+
new ParsedComponent("𠜎𠜱𠝹𠱓 😀🤠", "house"),
111+
new ParsedComponent("london", "city"),
112+
new ParsedComponent("uk", "country")
113+
);
114+
}
89115
}

0 commit comments

Comments
 (0)