diff --git a/pom.xml b/pom.xml
index 681b318..745d636 100644
--- a/pom.xml
+++ b/pom.xml
@@ -81,6 +81,7 @@
maven-surefire-plugin
2.14.1
+ -Dfile.encoding=UTF-8
java.util.logging.config.file
diff --git a/src/main/java/com/github/nkzawa/engineio/parser/Parser.java b/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
index 0b15eda..61cbfab 100644
--- a/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
+++ b/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
@@ -1,8 +1,9 @@
package com.github.nkzawa.engineio.parser;
+import com.github.nkzawa.utf8.UTF8;
+
import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -47,7 +48,7 @@ public class Parser {
String encoded = String.valueOf(packets.get(packet.type));
if (null != packet.data) {
- encoded += packet.data;
+ encoded += UTF8.encode(String.valueOf(packet.data));
}
@SuppressWarnings("unchecked")
@@ -70,6 +71,7 @@ public class Parser {
} catch (IndexOutOfBoundsException e) {
type = -1;
}
+ data = UTF8.decode(data);
if (type < 0 || type >= packetslist.size()) {
return err;
@@ -102,7 +104,7 @@ public class Parser {
@Override
public void call(Object packet) {
if (packet instanceof String) {
- String encodingLength = String.valueOf(((String)packet).getBytes(Charset.forName("UTF-8")).length);
+ String encodingLength = String.valueOf(((String) packet).length());
byte[] sizeBuffer = new byte[encodingLength.length() + 2];
sizeBuffer[0] = (byte)0; // is a string
@@ -110,7 +112,7 @@ public class Parser {
sizeBuffer[i + 1] = (byte)Character.getNumericValue(encodingLength.charAt(i));
}
sizeBuffer[sizeBuffer.length - 1] = (byte)255;
- results.add(Buffer.concat(new byte[][] {sizeBuffer, ((String)packet).getBytes(Charset.forName("UTF-8"))}));
+ results.add(Buffer.concat(new byte[][] {sizeBuffer, stringToByteArray((String)packet)}));
return;
}
@@ -202,7 +204,7 @@ public class Parser {
byte[] msg = new byte[bufferTail.remaining()];
bufferTail.get(msg);
if (isString) {
- buffers.add(new String(msg, Charset.forName("UTF-8")));
+ buffers.add(byteArrayToString(msg));
} else {
buffers.add(msg);
}
@@ -226,6 +228,36 @@ public class Parser {
}
}
+    /**
+     * Converts raw bytes to a string holding one char per byte.
+     *
+     * @param bytes the bytes to convert; each is treated as unsigned
+     * @return a string whose i-th char has the value {@code bytes[i] & 0xFF}
+     */
+    public static String byteArrayToString(byte[] bytes) {
+        char[] chars = new char[bytes.length];
+        for (int i = 0; i < bytes.length; i++) {
+            chars[i] = (char) (bytes[i] & 0xFF);
+        }
+        return new String(chars);
+    }
+
+    /**
+     * Converts a string to bytes, one byte per char index.
+     *
+     * <p>Each element is the code point found at that index narrowed to a
+     * {@code byte}, so only the low eight bits survive.
+     *
+     * @param string the string to convert
+     * @return an array of {@code string.length()} bytes
+     */
+    public static byte[] stringToByteArray(String string) {
+        byte[] bytes = new byte[string.length()];
+        for (int i = 0; i < bytes.length; i++) {
+            bytes[i] = (byte) string.codePointAt(i);
+        }
+        return bytes;
+    }
public static interface EncodeCallback {
diff --git a/src/main/java/com/github/nkzawa/utf8/UTF8.java b/src/main/java/com/github/nkzawa/utf8/UTF8.java
new file mode 100644
index 0000000..331e1cc
--- /dev/null
+++ b/src/main/java/com/github/nkzawa/utf8/UTF8.java
@@ -0,0 +1,169 @@
+package com.github.nkzawa.utf8;
+
+/**
+ * UTF-8 encoder/decoder ported from utf8.js.
+ *
+ * <p>Both directions operate on "byte strings": strings in which each char
+ * carries one UTF-8 byte in its low eight bits.
+ *
+ * @see <a href="https://github.com/mathiasbynens/utf8.js">utf8.js</a>
+ */
+public class UTF8 {
+
+    /**
+     * Encodes text as a UTF-8 byte string.
+     *
+     * <p>Every code point of {@code string} is expanded to its one to four
+     * UTF-8 bytes, each emitted as a single char in U+0000..U+00FF. Unpaired
+     * surrogates are not rejected; they are encoded as three bytes.
+     *
+     * @param string the text to encode; must not be {@code null}
+     * @return the byte string holding the UTF-8 form of {@code string}
+     */
+    public static String encode(String string) {
+        int[] codePoints = ucs2decode(string);
+        StringBuilder byteString = new StringBuilder(codePoints.length);
+        for (int codePoint : codePoints) {
+            byteString.append(encodeCodePoint(codePoint));
+        }
+        return byteString.toString();
+    }
+
+    /**
+     * Decodes a UTF-8 byte string back into text.
+     *
+     * <p>Only the low eight bits of each input value are examined. The read
+     * position lives in a per-call object rather than in static fields, so
+     * concurrent calls cannot corrupt each other.
+     *
+     * @param byteString the byte string to decode; must not be {@code null}
+     * @return the decoded text
+     * @throws RuntimeException if the input has an invalid lead byte, a missing
+     *         or malformed continuation byte, an overlong form, or a four-byte
+     *         value outside U+10000..U+10FFFF
+     */
+    public static String decode(String byteString) {
+        Decoder decoder = new Decoder(ucs2decode(byteString));
+        StringBuilder decoded = new StringBuilder();
+        int codePoint;
+        while ((codePoint = decoder.decodeSymbol()) != -1) {
+            decoded.appendCodePoint(codePoint);
+        }
+        return decoded.toString();
+    }
+
+    /** Splits a string into its Unicode code points (surrogate pairs are combined). */
+    private static int[] ucs2decode(String string) {
+        int length = string.length();
+        int[] output = new int[string.codePointCount(0, length)];
+        int counter = 0;
+        int value;
+        for (int i = 0; i < length; i += Character.charCount(value)) {
+            value = string.codePointAt(i);
+            output[counter++] = value;
+        }
+        return output;
+    }
+
+    /** Returns the UTF-8 bytes of one code point, one byte per char. */
+    private static String encodeCodePoint(int codePoint) {
+        StringBuilder symbol = new StringBuilder(4);
+        if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
+            return symbol.append((char) codePoint).toString();
+        }
+        if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence
+            symbol.append((char) (((codePoint >> 6) & 0x1F) | 0xC0));
+        } else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
+            symbol.append((char) (((codePoint >> 12) & 0x0F) | 0xE0));
+            symbol.append(createByte(codePoint, 6));
+        } else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence
+            symbol.append((char) (((codePoint >> 18) & 0x07) | 0xF0));
+            symbol.append(createByte(codePoint, 12));
+            symbol.append(createByte(codePoint, 6));
+        }
+        // Every multi-byte form ends with the low six bits as a continuation byte.
+        symbol.append((char) ((codePoint & 0x3F) | 0x80));
+        return symbol.toString();
+    }
+
+    /** Builds the continuation byte holding the six bits of {@code codePoint} at {@code shift}. */
+    private static char createByte(int codePoint, int shift) {
+        return (char) (((codePoint >> shift) & 0x3F) | 0x80);
+    }
+
+    /** Read cursor over the byte values of a single {@code decode} call. */
+    private static final class Decoder {
+        private final int[] bytes;
+        private int index;
+
+        Decoder(int[] bytes) {
+            this.bytes = bytes;
+        }
+
+        /** Decodes the next code point, or returns -1 once the input is exhausted. */
+        int decodeSymbol() {
+            if (index > bytes.length) {
+                throw new RuntimeException("Invalid byte index");
+            }
+            if (index == bytes.length) {
+                return -1;
+            }
+
+            int byte1 = bytes[index++] & 0xFF;
+
+            // 1-byte sequence (no continuation bytes)
+            if ((byte1 & 0x80) == 0) {
+                return byte1;
+            }
+
+            // 2-byte sequence
+            if ((byte1 & 0xE0) == 0xC0) {
+                int byte2 = readContinuationByte();
+                int codePoint = ((byte1 & 0x1F) << 6) | byte2;
+                if (codePoint >= 0x80) {
+                    return codePoint;
+                }
+                // Overlong form; message kept as callers may already see it.
+                throw new RuntimeException("Invalid continuation byte");
+            }
+
+            // 3-byte sequence (may include unpaired surrogates)
+            if ((byte1 & 0xF0) == 0xE0) {
+                int byte2 = readContinuationByte();
+                int byte3 = readContinuationByte();
+                int codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
+                if (codePoint >= 0x0800) {
+                    return codePoint;
+                }
+                throw new RuntimeException("Invalid continuation byte");
+            }
+
+            // 4-byte sequence
+            if ((byte1 & 0xF8) == 0xF0) {
+                int byte2 = readContinuationByte();
+                int byte3 = readContinuationByte();
+                int byte4 = readContinuationByte();
+                int codePoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
+                if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+                    return codePoint;
+                }
+            }
+
+            // Invalid lead byte or out-of-range 4-byte value.
+            throw new RuntimeException("Invalid continuation byte");
+        }
+
+        /** Consumes one {@code 10xxxxxx} byte and returns its six payload bits. */
+        private int readContinuationByte() {
+            if (index >= bytes.length) {
+                throw new RuntimeException("Invalid byte index");
+            }
+
+            int continuationByte = bytes[index++] & 0xFF;
+            if ((continuationByte & 0xC0) == 0x80) {
+                return continuationByte & 0x3F;
+            }
+            throw new RuntimeException("Invalid continuation byte");
+        }
+    }
+}
diff --git a/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java b/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
index df12dcc..26d0d53 100644
--- a/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
+++ b/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
@@ -104,6 +104,33 @@ public class ParserTest {
});
}
+    /** A message containing a non-ASCII character must survive an encode/decode round trip. */
+    @Test
+    public void encodeUTF8SpecialCharsMessagePacket() {
+        final String message = "utf8 — string";
+        encodePacket(new Packet(Packet.MESSAGE, message), new EncodeCallback() {
+            @Override
+            public void call(String data) {
+                Packet decoded = decodePacket(data);
+                assertThat(decoded.type, is(Packet.MESSAGE));
+                assertThat(decoded.data, is(message));
+            }
+        });
+    }
+
+    /** Non-string data (here an int) must come back as its string form. */
+    @Test
+    public void encodeMessagePacketCoercingToString() {
+        encodePacket(new Packet(Packet.MESSAGE, 1), new EncodeCallback() {
+            @Override
+            public void call(String data) {
+                Packet decoded = decodePacket(data);
+                assertThat(decoded.type, is(Packet.MESSAGE));
+                assertThat(decoded.data, is("1"));
+            }
+        });
+    }
+
@Test
public void encodeUpgradePacket() {
encodePacket(new Packet(Packet.UPGRADE), new EncodeCallback() {
diff --git a/src/test/java/com/github/nkzawa/utf8/UTF8Test.java b/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
new file mode 100644
index 0000000..c9f26a9
--- /dev/null
+++ b/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
@@ -0,0 +1,109 @@
+package com.github.nkzawa.utf8;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/** Round-trip and malformed-input tests for {@link UTF8}. */
+@RunWith(JUnit4.class)
+public class UTF8Test {
+    /** Decoded text paired with the byte string expected from encoding it. */
+    private static final Data[] DATA = new Data[] {
+        // 1-byte
+        new Data(0x0000, "\u0000", "\u0000"),
+        new Data(0x005c, "\u005C\u005C", "\u005C\u005C"), // = backslash
+        new Data(0x007f, "\u007F", "\u007F"),
+        // 2-byte
+        new Data(0x0080, "\u0080", "\u00C2\u0080"),
+        new Data(0x05CA, "\u05CA", "\u00D7\u008A"),
+        new Data(0x07FF, "\u07FF", "\u00DF\u00BF"),
+        // 3-byte
+        new Data(0x0800, "\u0800", "\u00E0\u00A0\u0080"),
+        new Data(0x2C3C, "\u2C3C", "\u00E2\u00B0\u00BC"),
+        new Data(0xFFFF, "\uFFFF", "\u00EF\u00BF\u00BF"),
+        // unmatched surrogate halves
+        // high surrogates: 0xD800 to 0xDBFF
+        new Data(0xD800, "\uD800", "\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by another high surrogate",
+            "\uD800\uD800", "\u00ED\u00A0\u0080\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by a symbol that is not a surrogate",
+            "\uD800A", "\u00ED\u00A0\u0080A"),
+        new Data("Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate",
+            "\uD800\uD834\uDF06\uD800", "\u00ED\u00A0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00A0\u0080"),
+        new Data(0xD9AF, "\uD9AF", "\u00ED\u00A6\u00AF"),
+        new Data(0xDBFF, "\uDBFF", "\u00ED\u00AF\u00BF"),
+        // low surrogates: 0xDC00 to 0xDFFF
+        new Data(0xDC00, "\uDC00", "\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by another low surrogate",
+            "\uDC00\uDC00", "\u00ED\u00B0\u0080\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by a symbol that is not a surrogate",
+            "\uDC00A", "\u00ED\u00B0\u0080A"),
+        new Data("Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate",
+            "\uDC00\uD834\uDF06\uDC00", "\u00ED\u00B0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00B0\u0080"),
+        new Data(0xDEEE, "\uDEEE", "\u00ED\u00BB\u00AE"),
+        new Data(0xDFFF, "\uDFFF", "\u00ED\u00BF\u00BF"),
+        // 4-byte
+        new Data(0x010000, "\uD800\uDC00", "\u00F0\u0090\u0080\u0080"),
+        new Data(0x01D306, "\uD834\uDF06", "\u00F0\u009D\u008C\u0086"),
+        new Data(0x10FFFF, "\uDBFF\uDFFF", "\u00F4\u008F\u00BF\u00BF"),
+    };
+
+    /** Every vector must encode to its byte string and decode back to its text. */
+    @Test
+    public void encodeAndDecode() {
+        for (Data data : DATA) {
+            String reason = data.description != null
+                ? data.description
+                : "U+" + Integer.toHexString(data.codePoint).toUpperCase();
+            // assertThat takes (reason, actual, matcher-of-expected).
+            assertThat("Encoding: " + reason, UTF8.encode(data.decoded), is(data.encoded));
+            assertThat("Decoding: " + reason, UTF8.decode(data.encoded), is(data.decoded));
+        }
+    }
+
+    // Each malformed input gets its own test: the first exception ends a test
+    // method, so checks stacked in one method after it would never run.
+
+    @Test(expected = RuntimeException.class)
+    public void decodeRejectsInvalidLeadByte() {
+        UTF8.decode("\uFFFF");
+    }
+
+    @Test(expected = RuntimeException.class)
+    public void decodeRejectsNonContinuationByte() {
+        UTF8.decode("\u00E9\u0000\u0000");
+    }
+
+    @Test(expected = RuntimeException.class)
+    public void decodeRejectsMalformedContinuationByte() {
+        UTF8.decode("\u00C2\uFFFF");
+    }
+
+    @Test(expected = RuntimeException.class)
+    public void decodeRejectsTruncatedSequence() {
+        UTF8.decode("\u00F0\u009D");
+    }
+
+    /** One test vector; {@code codePoint} or {@code description} labels failures. */
+    private static class Data {
+        public int codePoint = -1;
+        public String description;
+        public String decoded;
+        public String encoded;
+
+        public Data(int codePoint, String decoded, String encoded) {
+            this.codePoint = codePoint;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+
+        public Data(String description, String decoded, String encoded) {
+            this.description = description;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+    }
+}