From 8b07bbd9f07ddcc9a385441d9f79f62473cefcfc Mon Sep 17 00:00:00 2001 From: nkzawa Date: Wed, 12 Jul 2017 12:31:41 +0900 Subject: [PATCH] utf8: add strict option --- src/main/java/io/socket/utf8/UTF8.java | 45 +++++++++++++++++++------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/main/java/io/socket/utf8/UTF8.java b/src/main/java/io/socket/utf8/UTF8.java index 511d478..e714579 100644 --- a/src/main/java/io/socket/utf8/UTF8.java +++ b/src/main/java/io/socket/utf8/UTF8.java @@ -18,6 +18,12 @@ public final class UTF8 { private UTF8 () {} public static String encode(String string) throws UTF8Exception { + return encode(string, new Options()); + } + + public static String encode(String string, Options opts) throws UTF8Exception { + boolean strict = opts.strict; + int[] codePoints = ucs2decode(string); int length = codePoints.length; int index = -1; @@ -25,18 +31,24 @@ public final class UTF8 { StringBuilder byteString = new StringBuilder(); while (++index < length) { codePoint = codePoints[index]; - byteString.append(encodeCodePoint(codePoint)); + byteString.append(encodeCodePoint(codePoint, strict)); } return byteString.toString(); } public static String decode(String byteString) throws UTF8Exception { + return decode(byteString, new Options()); + } + + public static String decode(String byteString, Options opts) throws UTF8Exception { + boolean strict = opts.strict; + byteArray = ucs2decode(byteString); byteCount = byteArray.length; byteIndex = 0; List codePoints = new ArrayList(); int tmp; - while ((tmp = decodeSymbol()) != -1) { + while ((tmp = decodeSymbol(strict)) != -1) { codePoints.add(tmp); } return ucs2encode(listToArray(codePoints)); @@ -54,7 +66,7 @@ public final class UTF8 { return output; } - private static String encodeCodePoint(int codePoint) throws UTF8Exception { + private static String encodeCodePoint(int codePoint, boolean strict) throws UTF8Exception { StringBuilder symbol = new StringBuilder(); if ((codePoint & 0xFFFFFF80) == 0) { return symbol.append(Character.toChars(codePoint)).toString(); @@ -62,7 +74,9 @@ public final class UTF8 { if ((codePoint & 0xFFFFF800) == 0) { symbol.append(Character.toChars(((codePoint >> 6) & 0x1F) | 0xC0)); } else if ((codePoint & 0xFFFF0000) == 0) { - checkScalarValue(codePoint); + if (!checkScalarValue(codePoint, strict)) { + codePoint = 0xFFFD; + } symbol.append(Character.toChars(((codePoint >> 12) & 0x0F) | 0xE0)); symbol.append(createByte(codePoint, 6)); } else if ((codePoint & 0xFFE00000) == 0) { @@ -78,7 +92,7 @@ public final class UTF8 { return Character.toChars(((codePoint >> shift) & 0x3F) | 0x80); } - private static int decodeSymbol() throws UTF8Exception { + private static int decodeSymbol(boolean strict) throws UTF8Exception { int byte1; int byte2; int byte3; @@ -115,8 +129,7 @@ public final class UTF8 { byte3 = readContinuationByte(); codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; if (codePoint >= 0x0800) { - checkScalarValue(codePoint); - return codePoint; + return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD; } else { throw new UTF8Exception(INVALID_CONTINUATION_BYTE); } @@ -158,13 +171,17 @@ public final class UTF8 { return output.toString(); } - private static void checkScalarValue(int codePoint) throws UTF8Exception { + private static boolean checkScalarValue(int codePoint, boolean strict) throws UTF8Exception { if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { - throw new UTF8Exception( - "Lone surrogate U+" + Integer.toHexString(codePoint).toUpperCase() + - " is not a scalar value" - ); + if (strict) { + throw new UTF8Exception( + "Lone surrogate U+" + Integer.toHexString(codePoint).toUpperCase() + + " is not a scalar value" + ); + } + return false; } + return true; } private static int[] listToArray(List list) { @@ -175,4 +192,8 @@ public final class UTF8 { } return array; } + + public static class Options { + public boolean strict = true; + } }