utf8: add strict option

This commit is contained in:
nkzawa
2017-07-12 12:31:41 +09:00
parent e4f1a56cfc
commit 8b07bbd9f0

View File

@@ -18,6 +18,12 @@ public final class UTF8 {
private UTF8 () {} private UTF8 () {}
public static String encode(String string) throws UTF8Exception { public static String encode(String string) throws UTF8Exception {
return encode(string, new Options());
}
public static String encode(String string, Options opts) throws UTF8Exception {
boolean strict = opts.strict;
int[] codePoints = ucs2decode(string); int[] codePoints = ucs2decode(string);
int length = codePoints.length; int length = codePoints.length;
int index = -1; int index = -1;
@@ -25,18 +31,24 @@ public final class UTF8 {
StringBuilder byteString = new StringBuilder(); StringBuilder byteString = new StringBuilder();
while (++index < length) { while (++index < length) {
codePoint = codePoints[index]; codePoint = codePoints[index];
byteString.append(encodeCodePoint(codePoint)); byteString.append(encodeCodePoint(codePoint, strict));
} }
return byteString.toString(); return byteString.toString();
} }
public static String decode(String byteString) throws UTF8Exception { public static String decode(String byteString) throws UTF8Exception {
return decode(byteString, new Options());
}
public static String decode(String byteString, Options opts) throws UTF8Exception {
boolean strict = opts.strict;
byteArray = ucs2decode(byteString); byteArray = ucs2decode(byteString);
byteCount = byteArray.length; byteCount = byteArray.length;
byteIndex = 0; byteIndex = 0;
List<Integer> codePoints = new ArrayList<Integer>(); List<Integer> codePoints = new ArrayList<Integer>();
int tmp; int tmp;
while ((tmp = decodeSymbol()) != -1) { while ((tmp = decodeSymbol(strict)) != -1) {
codePoints.add(tmp); codePoints.add(tmp);
} }
return ucs2encode(listToArray(codePoints)); return ucs2encode(listToArray(codePoints));
@@ -54,7 +66,7 @@ public final class UTF8 {
return output; return output;
} }
private static String encodeCodePoint(int codePoint) throws UTF8Exception { private static String encodeCodePoint(int codePoint, boolean strict) throws UTF8Exception {
StringBuilder symbol = new StringBuilder(); StringBuilder symbol = new StringBuilder();
if ((codePoint & 0xFFFFFF80) == 0) { if ((codePoint & 0xFFFFFF80) == 0) {
return symbol.append(Character.toChars(codePoint)).toString(); return symbol.append(Character.toChars(codePoint)).toString();
@@ -62,7 +74,9 @@ public final class UTF8 {
if ((codePoint & 0xFFFFF800) == 0) { if ((codePoint & 0xFFFFF800) == 0) {
symbol.append(Character.toChars(((codePoint >> 6) & 0x1F) | 0xC0)); symbol.append(Character.toChars(((codePoint >> 6) & 0x1F) | 0xC0));
} else if ((codePoint & 0xFFFF0000) == 0) { } else if ((codePoint & 0xFFFF0000) == 0) {
checkScalarValue(codePoint); if (!checkScalarValue(codePoint, strict)) {
codePoint = 0xFFFD;
}
symbol.append(Character.toChars(((codePoint >> 12) & 0x0F) | 0xE0)); symbol.append(Character.toChars(((codePoint >> 12) & 0x0F) | 0xE0));
symbol.append(createByte(codePoint, 6)); symbol.append(createByte(codePoint, 6));
} else if ((codePoint & 0xFFE00000) == 0) { } else if ((codePoint & 0xFFE00000) == 0) {
@@ -78,7 +92,7 @@ public final class UTF8 {
return Character.toChars(((codePoint >> shift) & 0x3F) | 0x80); return Character.toChars(((codePoint >> shift) & 0x3F) | 0x80);
} }
private static int decodeSymbol() throws UTF8Exception { private static int decodeSymbol(boolean strict) throws UTF8Exception {
int byte1; int byte1;
int byte2; int byte2;
int byte3; int byte3;
@@ -115,8 +129,7 @@ public final class UTF8 {
byte3 = readContinuationByte(); byte3 = readContinuationByte();
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
if (codePoint >= 0x0800) { if (codePoint >= 0x0800) {
checkScalarValue(codePoint); return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
return codePoint;
} else { } else {
throw new UTF8Exception(INVALID_CONTINUATION_BYTE); throw new UTF8Exception(INVALID_CONTINUATION_BYTE);
} }
@@ -158,13 +171,17 @@ public final class UTF8 {
return output.toString(); return output.toString();
} }
private static void checkScalarValue(int codePoint) throws UTF8Exception { private static boolean checkScalarValue(int codePoint, boolean strict) throws UTF8Exception {
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
throw new UTF8Exception( if (strict) {
"Lone surrogate U+" + Integer.toHexString(codePoint).toUpperCase() + throw new UTF8Exception(
" is not a scalar value" "Lone surrogate U+" + Integer.toHexString(codePoint).toUpperCase() +
); " is not a scalar value"
);
}
return false;
} }
return true;
} }
private static int[] listToArray(List<Integer> list) { private static int[] listToArray(List<Integer> list) {
@@ -175,4 +192,8 @@ public final class UTF8 {
} }
return array; return array;
} }
public static class Options {
public boolean strict = true;
}
} }