From 7c67fa5d9bb546a5918d74cf2906b4f3add57373 Mon Sep 17 00:00:00 2001
From: Naoyuki Kanezawa
Date: Sun, 1 Jun 2014 20:21:54 +0900
Subject: [PATCH] compatible with engine.io-parser 1.0.6

---
Review notes (text between the "---" line and the diffstat is ignored by git am):

 * Line breaks and leading whitespace were rebuilt from a whitespace-collapsed
   copy of this patch. Indentation of context lines is a best guess; verify
   against the tree or apply with --ignore-whitespace.
 * Angle-bracket tokens (pom.xml tags, generic type arguments, the Javadoc
   link) were missing from that copy. They are restored only where the code
   or XML cannot be valid without them; confirm against the tree.
 * Blob ids on the "index" lines are those of the original submission.
 * UTF8: decoder state moved from static fields into a per-call reader, so
   concurrent decode() calls cannot corrupt each other.
 * UTF8Test: every malformed input now has its own test (ExpectedException
   ends a test at the first throw, so three of the four checks never ran);
   code point labels for U+FFFF and U+10FFFF corrected; actual/expected
   arguments of assertThat put in the right order.
 * Parser: Javadoc for the new public helpers; stringToByteArray reads chars
   instead of code points.

 pom.xml                                       |   1 +
 .../github/nkzawa/engineio/parser/Parser.java |  41 ++++-
 .../java/com/github/nkzawa/utf8/UTF8.java     | 163 ++++++++++++++++++
 .../nkzawa/engineio/parser/ParserTest.java    |  24 +++
 .../java/com/github/nkzawa/utf8/UTF8Test.java | 112 ++++++++++++
 5 files changed, 336 insertions(+), 5 deletions(-)
 create mode 100644 src/main/java/com/github/nkzawa/utf8/UTF8.java
 create mode 100644 src/test/java/com/github/nkzawa/utf8/UTF8Test.java

diff --git a/pom.xml b/pom.xml
index 681b318..745d636 100644
--- a/pom.xml
+++ b/pom.xml
@@ -81,6 +81,7 @@
         <artifactId>maven-surefire-plugin</artifactId>
         <version>2.14.1</version>
         <configuration>
+          <argLine>-Dfile.encoding=UTF-8</argLine>
           <systemProperties>
             <property>
               <name>java.util.logging.config.file</name>
diff --git a/src/main/java/com/github/nkzawa/engineio/parser/Parser.java b/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
index 0b15eda..61cbfab 100644
--- a/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
+++ b/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
@@ -1,8 +1,9 @@
 package com.github.nkzawa.engineio.parser;
 
 
+import com.github.nkzawa.utf8.UTF8;
+
 import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -47,7 +48,7 @@ public class Parser {
         String encoded = String.valueOf(packets.get(packet.type));
 
         if (null != packet.data) {
-            encoded += packet.data;
+            encoded += UTF8.encode(String.valueOf(packet.data));
         }
 
         @SuppressWarnings("unchecked")
@@ -70,6 +71,7 @@ public class Parser {
         } catch (IndexOutOfBoundsException e) {
             type = -1;
         }
+        data = UTF8.decode(data);
 
         if (type < 0 || type >= packetslist.size()) {
             return err;
@@ -102,7 +104,7 @@ public class Parser {
                 @Override
                 public void call(Object packet) {
                     if (packet instanceof String) {
-                        String encodingLength = String.valueOf(((String)packet).getBytes(Charset.forName("UTF-8")).length);
+                        String encodingLength = String.valueOf(((String) packet).length());
                         byte[] sizeBuffer = new byte[encodingLength.length() + 2];
 
                         sizeBuffer[0] = (byte)0; // is a string
@@ -110,7 +112,7 @@ public class Parser {
                             sizeBuffer[i + 1] = (byte)Character.getNumericValue(encodingLength.charAt(i));
                         }
                         sizeBuffer[sizeBuffer.length - 1] = (byte)255;
-                        results.add(Buffer.concat(new byte[][] {sizeBuffer, ((String)packet).getBytes(Charset.forName("UTF-8"))}));
+                        results.add(Buffer.concat(new byte[][] {sizeBuffer, stringToByteArray((String)packet)}));
                         return;
                     }
 
@@ -202,7 +204,7 @@ public class Parser {
             byte[] msg = new byte[bufferTail.remaining()];
             bufferTail.get(msg);
             if (isString) {
-                buffers.add(new String(msg, Charset.forName("UTF-8")));
+                buffers.add(byteArrayToString(msg));
             } else {
                 buffers.add(msg);
             }
@@ -226,6 +228,35 @@ public class Parser {
         }
     }
 
+    /**
+     * Converts raw bytes to a byte string holding one byte (0x00-0xFF) per char.
+     *
+     * @param bytes the bytes to convert
+     * @return a string with one char per input byte
+     */
+    public static String byteArrayToString(byte[] bytes) {
+        StringBuilder builder = new StringBuilder(bytes.length);
+        for (byte b : bytes) {
+            builder.appendCodePoint(b & 0xFF);
+        }
+        return builder.toString();
+    }
+
+    /**
+     * Converts a byte string to raw bytes, keeping only the low eight bits of
+     * each char.
+     *
+     * @param string a string holding one byte per char
+     * @return an array with one byte per char of {@code string}
+     */
+    public static byte[] stringToByteArray(String string) {
+        int len = string.length();
+        byte[] bytes = new byte[len];
+        for (int i = 0; i < len; i++) {
+            bytes[i] = (byte) string.charAt(i);
+        }
+        return bytes;
+    }
 
     public static interface EncodeCallback<T> {
 
diff --git a/src/main/java/com/github/nkzawa/utf8/UTF8.java b/src/main/java/com/github/nkzawa/utf8/UTF8.java
new file mode 100644
index 0000000..331e1cc
--- /dev/null
+++ b/src/main/java/com/github/nkzawa/utf8/UTF8.java
@@ -0,0 +1,163 @@
+package com.github.nkzawa.utf8;
+
+/**
+ * UTF-8 encoder/decoder ported from utf8.js.
+ * <p>
+ * Works on "byte strings": strings in which each char carries one UTF-8
+ * byte (0x00-0xFF). All decoder state is local to a call, so concurrent
+ * calls cannot interfere with each other.
+ *
+ * @see <a href="https://github.com/mathiasbynens/utf8.js">utf8.js</a>
+ */
+public final class UTF8 {
+
+    private UTF8() {}
+
+    /**
+     * Encodes text as a UTF-8 byte string.
+     *
+     * @param string text to encode; an unpaired surrogate is encoded as a
+     *               three-byte sequence
+     * @return a string holding one UTF-8 byte (0x00-0xFF) per char
+     */
+    public static String encode(String string) {
+        StringBuilder byteString = new StringBuilder(string.length());
+        for (int codePoint : ucs2decode(string)) {
+            appendEncoded(byteString, codePoint);
+        }
+        return byteString.toString();
+    }
+
+    /**
+     * Decodes a UTF-8 byte string back into text.
+     *
+     * @param byteString one UTF-8 byte per char; only the low eight bits of
+     *                   each code point are read
+     * @return the decoded text
+     * @throws RuntimeException if the input is not well-formed UTF-8
+     */
+    public static String decode(String byteString) {
+        ByteReader reader = new ByteReader(ucs2decode(byteString));
+        StringBuilder output = new StringBuilder(byteString.length());
+        int codePoint;
+        while ((codePoint = decodeSymbol(reader)) != -1) {
+            output.appendCodePoint(codePoint);
+        }
+        return output.toString();
+    }
+
+    /** Splits a string into code points; unpaired surrogates are kept as-is. */
+    private static int[] ucs2decode(String string) {
+        int length = string.length();
+        int[] output = new int[string.codePointCount(0, length)];
+        int counter = 0;
+        int value;
+        for (int i = 0; i < length; i += Character.charCount(value)) {
+            value = string.codePointAt(i);
+            output[counter++] = value;
+        }
+        return output;
+    }
+
+    /** Appends the one- to four-byte UTF-8 sequence for {@code codePoint}. */
+    private static void appendEncoded(StringBuilder out, int codePoint) {
+        if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
+            out.append((char) codePoint);
+            return;
+        }
+        if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence
+            out.append((char) (((codePoint >> 6) & 0x1F) | 0xC0));
+        } else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
+            out.append((char) (((codePoint >> 12) & 0x0F) | 0xE0));
+            out.append(continuationByte(codePoint, 6));
+        } else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence
+            out.append((char) (((codePoint >> 18) & 0x07) | 0xF0));
+            out.append(continuationByte(codePoint, 12));
+            out.append(continuationByte(codePoint, 6));
+        }
+        out.append(continuationByte(codePoint, 0));
+    }
+
+    private static char continuationByte(int codePoint, int shift) {
+        return (char) (((codePoint >> shift) & 0x3F) | 0x80);
+    }
+
+    /**
+     * Reads the next code point from {@code reader}.
+     *
+     * @return the code point, or -1 once every byte has been consumed
+     */
+    private static int decodeSymbol(ByteReader reader) {
+        if (!reader.hasNext()) {
+            return -1;
+        }
+
+        int byte1 = reader.next();
+        if ((byte1 & 0x80) == 0) { // 1-byte sequence
+            return byte1;
+        }
+
+        int codePoint;
+        if ((byte1 & 0xE0) == 0xC0) { // 2-byte sequence
+            codePoint = ((byte1 & 0x1F) << 6) | reader.nextContinuation();
+            if (codePoint >= 0x80) {
+                return codePoint;
+            }
+            // overlong encoding
+            throw new RuntimeException("Invalid continuation byte");
+        }
+
+        if ((byte1 & 0xF0) == 0xE0) { // 3-byte sequence
+            int byte2 = reader.nextContinuation();
+            int byte3 = reader.nextContinuation();
+            codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
+            if (codePoint >= 0x0800) {
+                return codePoint;
+            }
+            // overlong encoding
+            throw new RuntimeException("Invalid continuation byte");
+        }
+
+        if ((byte1 & 0xF8) == 0xF0) { // 4-byte sequence
+            int byte2 = reader.nextContinuation();
+            int byte3 = reader.nextContinuation();
+            int byte4 = reader.nextContinuation();
+            codePoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
+            if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+                return codePoint;
+            }
+        }
+
+        throw new RuntimeException("Invalid continuation byte");
+    }
+
+    /** Cursor over the bytes of one {@link #decode(String)} call. */
+    private static final class ByteReader {
+        private final int[] bytes;
+        private int index;
+
+        ByteReader(int[] bytes) {
+            this.bytes = bytes;
+        }
+
+        boolean hasNext() {
+            return index < bytes.length;
+        }
+
+        int next() {
+            return bytes[index++] & 0xFF;
+        }
+
+        /** Reads a 10xxxxxx byte and returns its six payload bits. */
+        int nextContinuation() {
+            if (!hasNext()) {
+                throw new RuntimeException("Invalid byte index");
+            }
+            int continuationByte = next();
+            if ((continuationByte & 0xC0) == 0x80) {
+                return continuationByte & 0x3F;
+            }
+            throw new RuntimeException("Invalid continuation byte");
+        }
+    }
+}
diff --git a/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java b/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
index df12dcc..26d0d53 100644
--- a/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
+++ b/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
@@ -104,6 +104,30 @@ public class ParserTest {
         });
     }
 
+    @Test
+    public void encodeUTF8SpecialCharsMessagePacket() {
+        encodePacket(new Packet(Packet.MESSAGE, "utf8 — string"), new EncodeCallback<String>() {
+            @Override
+            public void call(String data) {
+                Packet<String> p = decodePacket(data);
+                assertThat(p.type, is(Packet.MESSAGE));
+                assertThat(p.data, is("utf8 — string"));
+            }
+        });
+    }
+
+    @Test
+    public void encodeMessagePacketCoercingToString() {
+        encodePacket(new Packet(Packet.MESSAGE, 1), new EncodeCallback<String>() {
+            @Override
+            public void call(String data) {
+                Packet<String> p = decodePacket(data);
+                assertThat(p.type, is(Packet.MESSAGE));
+                assertThat(p.data, is("1"));
+            }
+        });
+    }
+
     @Test
     public void encodeUpgradePacket() {
         encodePacket(new Packet(Packet.UPGRADE), new EncodeCallback<String>() {
diff --git a/src/test/java/com/github/nkzawa/utf8/UTF8Test.java b/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
new file mode 100644
index 0000000..c9f26a9
--- /dev/null
+++ b/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
@@ -0,0 +1,112 @@
+package com.github.nkzawa.utf8;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+@RunWith(JUnit4.class)
+public class UTF8Test {
+    private static final Data[] DATA = new Data[] {
+        // 1-byte
+        new Data(0x0000, "\u0000", "\u0000"),
+        new Data(0x005c, "\u005C\u005C", "\u005C\u005C"), // = backslash
+        new Data(0x007f, "\u007F", "\u007F"),
+        // 2-byte
+        new Data(0x0080, "\u0080", "\u00C2\u0080"),
+        new Data(0x05CA, "\u05CA", "\u00D7\u008A"),
+        new Data(0x07FF, "\u07FF", "\u00DF\u00BF"),
+        // 3-byte
+        new Data(0x0800, "\u0800", "\u00E0\u00A0\u0080"),
+        new Data(0x2C3C, "\u2C3C", "\u00E2\u00B0\u00BC"),
+        new Data(0xFFFF, "\uFFFF", "\u00EF\u00BF\u00BF"),
+        // unmatched surrogate halves
+        // high surrogates: 0xD800 to 0xDBFF
+        new Data(0xD800, "\uD800", "\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by another high surrogate",
+            "\uD800\uD800", "\u00ED\u00A0\u0080\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by a symbol that is not a surrogate",
+            "\uD800A", "\u00ED\u00A0\u0080A"),
+        new Data("Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate",
+            "\uD800\uD834\uDF06\uD800", "\u00ED\u00A0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00A0\u0080"),
+        new Data(0xD9AF, "\uD9AF", "\u00ED\u00A6\u00AF"),
+        new Data(0xDBFF, "\uDBFF", "\u00ED\u00AF\u00BF"),
+        // low surrogates: 0xDC00 to 0xDFFF
+        new Data(0xDC00, "\uDC00", "\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by another low surrogate",
+            "\uDC00\uDC00", "\u00ED\u00B0\u0080\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by a symbol that is not a surrogate",
+            "\uDC00A", "\u00ED\u00B0\u0080A"),
+        new Data("Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate",
+            "\uDC00\uD834\uDF06\uDC00", "\u00ED\u00B0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00B0\u0080"),
+        new Data(0xDEEE, "\uDEEE", "\u00ED\u00BB\u00AE"),
+        new Data(0xDFFF, "\uDFFF", "\u00ED\u00BF\u00BF"),
+        // 4-byte
+        new Data(0x010000, "\uD800\uDC00", "\u00F0\u0090\u0080\u0080"),
+        new Data(0x01D306, "\uD834\uDF06", "\u00F0\u009D\u008C\u0086"),
+        new Data(0x10FFFF, "\uDBFF\uDFFF", "\u00F4\u008F\u00BF\u00BF"),
+    };
+
+    @Rule
+    public ExpectedException exception = ExpectedException.none();
+
+    @Test
+    public void encodeAndDecode() {
+        for (Data data : DATA) {
+            String reason = data.description != null
+                ? data.description : "U+" + Integer.toHexString(data.codePoint).toUpperCase();
+            assertThat("Encoding: " + reason, UTF8.encode(data.decoded), is(data.encoded));
+            assertThat("Decoding: " + reason, UTF8.decode(data.encoded), is(data.decoded));
+        }
+    }
+
+    // Each malformed input gets its own test: ExpectedException ends a test at
+    // the first throw, so later statements in the same method would never run.
+
+    @Test
+    public void decodeInvalidLeadByte() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\uFFFF");
+    }
+
+    @Test
+    public void decodeInvalidContinuationByte() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00E9\u0000\u0000");
+    }
+
+    @Test
+    public void decodeContinuationByteOutOfRange() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00C2\uFFFF");
+    }
+
+    @Test
+    public void decodeTruncatedSequence() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00F0\u009D");
+    }
+
+    private static class Data {
+        public int codePoint = -1;
+        public String description;
+        public String decoded;
+        public String encoded;
+
+        public Data(int codePoint, String decoded, String encoded) {
+            this.codePoint = codePoint;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+
+        public Data(String description, String decoded, String encoded) {
+            this.description = description;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+    }
+}