compatible with engine.io-parser 1.0.6

2014-06-01 20:21:54 +09:00
parent 7fcd5c5568
commit 7c67fa5d9b
5 changed files with 308 additions and 5 deletions
--- a/pom.xml
+++ b/pom.xml
@@ -81,6 +81,7 @@
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.14.1</version>
        <configuration>
+          <argLine>-Dfile.encoding=UTF-8</argLine>
          <systemProperties>
            <property>
              <name>java.util.logging.config.file</name>
--- a/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
+++ b/src/main/java/com/github/nkzawa/engineio/parser/Parser.java
@@ -1,8 +1,9 @@
 package com.github.nkzawa.engineio.parser;


+import com.github.nkzawa.utf8.UTF8;
+
 import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -47,7 +48,7 @@ public class Parser {
        String encoded = String.valueOf(packets.get(packet.type));

        if (null != packet.data) {
-            encoded += packet.data;
+            encoded += UTF8.encode(String.valueOf(packet.data));
        }

        @SuppressWarnings("unchecked")
@@ -70,6 +71,7 @@ public class Parser {
        } catch (IndexOutOfBoundsException e) {
            type = -1;
        }
+        data = UTF8.decode(data);

        if (type < 0 || type >= packetslist.size()) {
            return err;
@@ -102,7 +104,7 @@ public class Parser {
                @Override
                public void call(Object packet) {
                    if (packet instanceof String) {
-                        String encodingLength = String.valueOf(((String)packet).getBytes(Charset.forName("UTF-8")).length);
+                        String encodingLength = String.valueOf(((String) packet).length());
                        byte[] sizeBuffer = new byte[encodingLength.length() + 2];

                        sizeBuffer[0] = (byte)0; // is a string
@@ -110,7 +112,7 @@ public class Parser {
                            sizeBuffer[i + 1] = (byte)Character.getNumericValue(encodingLength.charAt(i));
                        }
                        sizeBuffer[sizeBuffer.length - 1] = (byte)255;
-                        results.add(Buffer.concat(new byte[][] {sizeBuffer, ((String)packet).getBytes(Charset.forName("UTF-8"))}));
+                        results.add(Buffer.concat(new byte[][] {sizeBuffer, stringToByteArray((String)packet)}));
                        return;
                    }

@@ -202,7 +204,7 @@ public class Parser {
            byte[] msg = new byte[bufferTail.remaining()];
            bufferTail.get(msg);
            if (isString) {
-                buffers.add(new String(msg, Charset.forName("UTF-8")));
+                buffers.add(byteArrayToString(msg));
            } else {
                buffers.add(msg);
            }
@@ -226,6 +228,22 @@ public class Parser {
        }
    }

+    /**
+     * Converts raw bytes into a "binary string" holding one char per byte.
+     * Every byte is read as unsigned, so each resulting char is in the
+     * range 0-255.
+     *
+     * @param bytes the bytes to convert
+     * @return a string whose length equals {@code bytes.length}
+     */
+    public static String byteArrayToString(byte[] bytes) {
+        char[] chars = new char[bytes.length];
+        for (int i = 0; i < chars.length; i++) {
+            chars[i] = (char) (bytes[i] & 0xFF);
+        }
+        return new String(chars);
+    }
+
+    /**
+     * Converts a "binary string" into bytes, one byte per UTF-16 code unit.
+     * Only the low 8 bits of each unit are kept, so the input is expected to
+     * contain chars in the range 0-255 (for example the output of
+     * {@link #byteArrayToString(byte[])}).
+     *
+     * @param string the string to convert
+     * @return a byte array whose length equals {@code string.length()}
+     */
+    public static byte[] stringToByteArray(String string) {
+        int len = string.length();
+        byte[] bytes = new byte[len];
+        for (int i = 0; i < len; i++) {
+            // charAt, not codePointAt: the array has one slot per UTF-16 unit,
+            // and codePointAt would fold a surrogate pair into one
+            // supplementary code point, so slot i would not reflect unit i.
+            bytes[i] = (byte) string.charAt(i);
+        }
+        return bytes;
+    }

    public static interface EncodeCallback<T> {

--- a/src/main/java/com/github/nkzawa/utf8/UTF8.java
+++ b/src/main/java/com/github/nkzawa/utf8/UTF8.java
@@ -0,0 +1,164 @@
+package com.github.nkzawa.utf8;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * UTF-8 encoder/decoder ported from utf8.js.
+ *
+ * <p>Both directions operate on "byte strings": {@link #encode(String)}
+ * returns a string in which each char holds one UTF-8 byte (0-255), and
+ * {@link #decode(String)} reads such a string back. Decoding keeps its
+ * position in a per-call cursor object rather than in static fields, so
+ * concurrent calls do not interfere with each other.
+ *
+ * @see <a href="https://github.com/mathiasbynens/utf8.js">https://github.com/mathiasbynens/utf8.js</a>
+ */
+public class UTF8 {
+
+    /** Returned by the decoding cursor once all input has been consumed. */
+    private static final int END_OF_INPUT = -1;
+
+    /**
+     * Encodes a string as UTF-8 and returns the bytes as a byte string.
+     * Unpaired surrogates are encoded as 3-byte sequences like any other
+     * code point below U+10000.
+     *
+     * @param string the text to encode
+     * @return a string with one char (0-255) per UTF-8 byte
+     */
+    public static String encode(String string) {
+        StringBuilder byteString = new StringBuilder();
+        for (int codePoint : ucs2decode(string)) {
+            byteString.append(encodeCodePoint(codePoint));
+        }
+        return byteString.toString();
+    }
+
+    /**
+     * Decodes a byte string produced by {@link #encode(String)}.
+     * Only the low 8 bits of each input char are considered.
+     *
+     * @param byteString a string with one char per UTF-8 byte
+     * @return the decoded text
+     * @throws RuntimeException if the input is malformed: an invalid lead
+     *         byte, a missing or invalid continuation byte, an overlong
+     *         encoding, or a 4-byte value above U+10FFFF
+     */
+    public static String decode(String byteString) {
+        ByteCursor cursor = new ByteCursor(ucs2decode(byteString));
+        StringBuilder decoded = new StringBuilder();
+        int codePoint;
+        while ((codePoint = cursor.decodeSymbol()) != END_OF_INPUT) {
+            decoded.appendCodePoint(codePoint);
+        }
+        return decoded.toString();
+    }
+
+    /** Splits a string into its code points; surrogate pairs become one entry. */
+    private static int[] ucs2decode(String string) {
+        int length = string.length();
+        int[] output = new int[string.codePointCount(0, length)];
+        int counter = 0;
+        int i = 0;
+        while (i < length) {
+            int value = string.codePointAt(i);
+            output[counter++] = value;
+            i += Character.charCount(value);
+        }
+        return output;
+    }
+
+    /** Returns the UTF-8 bytes of one code point, one char per byte. */
+    private static String encodeCodePoint(int codePoint) {
+        StringBuilder symbol = new StringBuilder();
+        if ((codePoint & 0xFFFFFF80) == 0) {
+            // 1-byte sequence: the code point is its own byte.
+            return symbol.append(Character.toChars(codePoint)).toString();
+        }
+        if ((codePoint & 0xFFFFF800) == 0) {
+            // 2-byte sequence lead byte.
+            symbol.append(Character.toChars(((codePoint >> 6) & 0x1F) | 0xC0));
+        } else if ((codePoint & 0xFFFF0000) == 0) {
+            // 3-byte sequence lead byte plus first continuation byte.
+            symbol.append(Character.toChars(((codePoint >> 12) & 0x0F) | 0xE0));
+            symbol.append(createByte(codePoint, 6));
+        } else if ((codePoint & 0xFFE00000) == 0) {
+            // 4-byte sequence lead byte plus first two continuation bytes.
+            symbol.append(Character.toChars(((codePoint >> 18) & 0x07) | 0xF0));
+            symbol.append(createByte(codePoint, 12));
+            symbol.append(createByte(codePoint, 6));
+        }
+        // Every multi-byte sequence ends with the lowest six bits.
+        symbol.append(Character.toChars((codePoint & 0x3F) | 0x80));
+        return symbol.toString();
+    }
+
+    /** Builds a continuation byte from the six bits starting at {@code shift}. */
+    private static char[] createByte(int codePoint, int shift) {
+        return Character.toChars(((codePoint >> shift) & 0x3F) | 0x80);
+    }
+
+    /**
+     * Read position over the bytes of a single {@link #decode(String)} call.
+     * One instance is created per call, so no state is shared between calls.
+     */
+    private static final class ByteCursor {
+
+        private final int[] bytes;
+        private int index;
+
+        ByteCursor(int[] bytes) {
+            this.bytes = bytes;
+        }
+
+        /**
+         * Decodes the next code point.
+         *
+         * @return the code point, or {@link #END_OF_INPUT} when no bytes remain
+         * @throws RuntimeException if the bytes at the cursor are malformed
+         */
+        int decodeSymbol() {
+            if (index > bytes.length) {
+                throw new RuntimeException("Invalid byte index");
+            }
+            if (index == bytes.length) {
+                return END_OF_INPUT;
+            }
+
+            int byte1 = bytes[index] & 0xFF;
+            index++;
+
+            // 1-byte sequence (no continuation bytes).
+            if ((byte1 & 0x80) == 0) {
+                return byte1;
+            }
+
+            // 2-byte sequence; values below 0x80 would be overlong.
+            if ((byte1 & 0xE0) == 0xC0) {
+                int byte2 = readContinuationByte();
+                int codePoint = ((byte1 & 0x1F) << 6) | byte2;
+                if (codePoint >= 0x80) {
+                    return codePoint;
+                }
+                throw new RuntimeException("Invalid continuation byte");
+            }
+
+            // 3-byte sequence; values below 0x0800 would be overlong.
+            if ((byte1 & 0xF0) == 0xE0) {
+                int byte2 = readContinuationByte();
+                int byte3 = readContinuationByte();
+                int codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
+                if (codePoint >= 0x0800) {
+                    return codePoint;
+                }
+                throw new RuntimeException("Invalid continuation byte");
+            }
+
+            // 4-byte sequence; must land in U+10000..U+10FFFF.
+            if ((byte1 & 0xF8) == 0xF0) {
+                int byte2 = readContinuationByte();
+                int byte3 = readContinuationByte();
+                int byte4 = readContinuationByte();
+                // Bit 3 of the lead byte is known to be 0 here, so masking
+                // with 0x07 keeps exactly the payload bits.
+                int codePoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
+                if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+                    return codePoint;
+                }
+            }
+
+            throw new RuntimeException("Invalid continuation byte");
+        }
+
+        /**
+         * Reads one continuation byte ({@code 10xxxxxx}) and returns its
+         * six payload bits.
+         *
+         * @throws RuntimeException if the input ends or the byte is not a
+         *         continuation byte
+         */
+        private int readContinuationByte() {
+            if (index >= bytes.length) {
+                throw new RuntimeException("Invalid byte index");
+            }
+
+            int continuationByte = bytes[index] & 0xFF;
+            index++;
+
+            if ((continuationByte & 0xC0) == 0x80) {
+                return continuationByte & 0x3F;
+            }
+
+            throw new RuntimeException("Invalid continuation byte");
+        }
+    }
+}
--- a/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
+++ b/src/test/java/com/github/nkzawa/engineio/parser/ParserTest.java
@@ -104,6 +104,30 @@ public class ParserTest {
        });
    }

+    /** Round-trips a message packet whose payload contains a non-ASCII character. */
+    @Test
+    public void encodeUTF8SpecialCharsMessagePacket() {
+        final String payload = "utf8 — string";
+        Packet<String> original = new Packet<String>(Packet.MESSAGE, payload);
+        encodePacket(original, new EncodeCallback<String>() {
+            @Override
+            public void call(String encoded) {
+                Packet<String> decoded = decodePacket(encoded);
+                assertThat(decoded.type, is(Packet.MESSAGE));
+                assertThat(decoded.data, is(payload));
+            }
+        });
+    }
+
+    /** Checks that a non-string payload comes back as its string form after a round trip. */
+    @Test
+    public void encodeMessagePacketCoercingToString() {
+        Packet<Integer> original = new Packet<Integer>(Packet.MESSAGE, 1);
+        encodePacket(original, new EncodeCallback<String>() {
+            @Override
+            public void call(String encoded) {
+                Packet<String> decoded = decodePacket(encoded);
+                assertThat(decoded.type, is(Packet.MESSAGE));
+                assertThat(decoded.data, is("1"));
+            }
+        });
+    }
+
    @Test
    public void encodeUpgradePacket() {
        encodePacket(new Packet<String>(Packet.UPGRADE), new EncodeCallback<String>() {
--- a/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
+++ b/src/test/java/com/github/nkzawa/utf8/UTF8Test.java
@@ -0,0 +1,96 @@
+package com.github.nkzawa.utf8;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Tests for {@link UTF8}: a table of decoded/encoded pairs checked in both
+ * directions, plus one test per malformed input that must be rejected.
+ */
+@RunWith(JUnit4.class)
+public class UTF8Test {
+    private static final Data[] DATA = new Data[] {
+        // 1-byte
+        new Data(0x0000, "\u0000", "\u0000"),
+        new Data(0x005c, "\u005C\u005C", "\u005C\u005C"), // = backslash
+        new Data(0x007f, "\u007F", "\u007F"),
+        // 2-byte
+        new Data(0x0080, "\u0080", "\u00C2\u0080"),
+        new Data(0x05CA, "\u05CA", "\u00D7\u008A"),
+        new Data(0x07FF, "\u07FF", "\u00DF\u00BF"),
+        // 3-byte
+        new Data(0x0800, "\u0800", "\u00E0\u00A0\u0080"),
+        new Data(0x2C3C, "\u2C3C", "\u00E2\u00B0\u00BC"),
+        new Data(0xFFFF, "\uFFFF", "\u00EF\u00BF\u00BF"),
+        // unmatched surrogate halves
+        // high surrogates: 0xD800 to 0xDBFF
+        new Data(0xD800, "\uD800", "\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by another high surrogate",
+                "\uD800\uD800", "\u00ED\u00A0\u0080\u00ED\u00A0\u0080"),
+        new Data("High surrogate followed by a symbol that is not a surrogate",
+                "\uD800A", "\u00ED\u00A0\u0080A"),
+        new Data("Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate",
+                "\uD800\uD834\uDF06\uD800", "\u00ED\u00A0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00A0\u0080"),
+        new Data(0xD9AF, "\uD9AF", "\u00ED\u00A6\u00AF"),
+        new Data(0xDBFF, "\uDBFF", "\u00ED\u00AF\u00BF"),
+        // low surrogates: 0xDC00 to 0xDFFF
+        new Data(0xDC00, "\uDC00", "\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by another low surrogate",
+                "\uDC00\uDC00", "\u00ED\u00B0\u0080\u00ED\u00B0\u0080"),
+        new Data("Low surrogate followed by a symbol that is not a surrogate",
+                "\uDC00A", "\u00ED\u00B0\u0080A"),
+        new Data("Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate",
+                "\uDC00\uD834\uDF06\uDC00", "\u00ED\u00B0\u0080\u00F0\u009D\u008C\u0086\u00ED\u00B0\u0080"),
+        new Data(0xDEEE, "\uDEEE", "\u00ED\u00BB\u00AE"),
+        new Data(0xDFFF, "\uDFFF", "\u00ED\u00BF\u00BF"),
+        // 4-byte
+        new Data(0x010000, "\uD800\uDC00", "\u00F0\u0090\u0080\u0080"),
+        new Data(0x01D306, "\uD834\uDF06", "\u00F0\u009D\u008C\u0086"),
+        new Data(0x10FFFF, "\uDBFF\uDFFF", "\u00F4\u008F\u00BF\u00BF"),
+    };
+
+    @Rule
+    public ExpectedException exception = ExpectedException.none();
+
+    /** Checks every table entry in both directions. */
+    @Test
+    public void encodeAndDecode() {
+        for (Data data : DATA) {
+            String reason = data.description != null
+                    ? data.description
+                    : "U+" + Integer.toHexString(data.codePoint).toUpperCase();
+            // Actual value first, expected value inside the matcher, so a
+            // failure message reports them the right way round.
+            assertThat("Encoding: " + reason, UTF8.encode(data.decoded), is(data.encoded));
+            assertThat("Decoding: " + reason, UTF8.decode(data.encoded), is(data.decoded));
+        }
+    }
+
+    // Each malformed input gets its own test: the first thrown exception ends
+    // a test method, so several checks in one method would leave all but the
+    // first unexecuted.
+
+    /** 0xFF (low byte of U+FFFF) cannot start any UTF-8 sequence. */
+    @Test
+    public void decodeRejectsInvalidLeadByte() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\uFFFF");
+    }
+
+    /** A 3-byte lead byte followed by bytes that are not continuation bytes. */
+    @Test
+    public void decodeRejectsNonContinuationByteInThreeByteSequence() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00E9\u0000\u0000");
+    }
+
+    /** A 2-byte lead byte followed by a byte that is not a continuation byte. */
+    @Test
+    public void decodeRejectsNonContinuationByteInTwoByteSequence() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00C2\uFFFF");
+    }
+
+    /** A 4-byte sequence that ends after its second byte. */
+    @Test
+    public void decodeRejectsTruncatedSequence() {
+        exception.expect(RuntimeException.class);
+        UTF8.decode("\u00F0\u009D");
+    }
+
+    /** One table row: the decoded text, its byte-string encoding, and a label. */
+    private static class Data {
+        /** Code point used to label the row, or -1 when a description is given. */
+        public final int codePoint;
+        /** Free-text label, or {@code null} when the row is labelled by code point. */
+        public final String description;
+        public final String decoded;
+        public final String encoded;
+
+        public Data(int codePoint, String decoded, String encoded) {
+            this.codePoint = codePoint;
+            this.description = null;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+
+        public Data(String description, String decoded, String encoded) {
+            this.codePoint = -1;
+            this.description = description;
+            this.decoded = decoded;
+            this.encoded = encoded;
+        }
+    }
+}