Skip to content

Commit

Permalink
ICU-22707 adjust UTS46 for Unicode 16
Browse files Browse the repository at this point in the history
See #3130
  • Loading branch information
markusicu committed Sep 6, 2024
1 parent 6d67afc commit 415a7ac
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 42 deletions.
7 changes: 6 additions & 1 deletion icu4c/source/common/uts46.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,12 @@ UTS46::processLabel(UnicodeString &dest,
if(U_FAILURE(errorCode)) {
return labelLength;
}
if(!isValid) {
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
if(!isValid || fromPunycode.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
}
Expand Down
66 changes: 44 additions & 22 deletions icu4c/source/test/intltest/uts46test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,18 @@ void UTS46Test::TestACELabelEdgeCases() {
idna->labelToUnicode(u"Xn---", result, info, errorCode);
assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}

{
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
IDNAInfo info;
idna->labelToUnicode("xn--xn---epa", result, info, errorCode);
assertTrue("error for xn--xn---epa",
(info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
}
}

void UTS46Test::TestTooLong() {
Expand Down Expand Up @@ -1016,9 +1028,15 @@ idnaTestLineFn(void *context,
reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
}

UnicodeString s16FromField(char *(&field)[2]) {
UnicodeString s16FromField(char *(&field)[2], const UnicodeString &sameAs) {
int32_t length = static_cast<int32_t>(field[1] - field[0]);
return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
UnicodeString s = UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
if (s.isEmpty()) {
s = sameAs; // blank means same as another string
} else if (s == u"\"\"") {
s.remove(); // explicit empty string (new in Unicode 16)
}
return s;
}

std::string statusFromField(char *(&field)[2]) {
Expand Down Expand Up @@ -1049,6 +1067,20 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
expectedHasErrors = true;
}
// ICU workaround:
// We do effectively VerifyDnsLength (we always check for lengths), except,
// based on past bug reports, we do not do the following in UTS #46 ToASCII:
// When VerifyDnsLength is true, the empty root label is disallowed.
// Ignore the expected error if it is the only one.
// TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
if (strncmp(type, "toASCII", 7) == 0 && // startsWith
strcmp(status, "[A4_2]") == 0 && !info.hasErrors()) {
if (result.endsWith(UnicodeString::readOnlyAlias(u".")) &&
// !contains
result.indexOf(UnicodeString::readOnlyAlias(u"..")) < 0) {
expectedHasErrors = false;
}
}
}
if (expectedHasErrors != info.hasErrors()) {
errln("%s expected errors %s %d != %d = actual has errors: %04lx\n %s",
Expand All @@ -1064,16 +1096,15 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
UnicodeString source = s16FromField(fields[0]);
// The source string to be tested.
// "" means the empty string.
UnicodeString source = s16FromField(fields[0], UnicodeString());

// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
UnicodeString toUnicode = s16FromField(fields[1]);
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// "" means the empty string.
UnicodeString toUnicode = s16FromField(fields[1], source);

// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
Expand All @@ -1083,10 +1114,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
UnicodeString toAsciiN = s16FromField(fields[3]);
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// "" means the empty string.
UnicodeString toAsciiN = s16FromField(fields[3], toUnicode);

// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
Expand All @@ -1099,10 +1128,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
UnicodeString toAsciiT = s16FromField(fields[5]);
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// "" means the empty string.
UnicodeString toAsciiT = s16FromField(fields[5], toAsciiN);

// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.
Expand Down Expand Up @@ -1133,12 +1160,7 @@ U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);

} // namespace

// http://www.unicode.org/Public/idna/latest/IdnaTestV2.txt
void UTS46Test::IdnaTest() {
if (logKnownIssue("ICU-22707",
"The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
return;
}
IcuTestErrorCode errorCode(*this, "IdnaTest");
const char *sourceTestDataPath = getSourceTestData(errorCode);
if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
Expand All @@ -1158,7 +1180,7 @@ void UTS46Test::IdnaTest() {
// Comments are indicated with hash marks.
char *fields[kNumFields][2];
u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
if (errorCode.errIfFailureAndReset("error parsing IdnaTestV2.txt")) {
return;
}
}
Expand Down
13 changes: 12 additions & 1 deletion icu4j/main/core/src/main/java/com/ibm/icu/impl/UTS46.java
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,12 @@ public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info i
// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
boolean isValid=uts46Norm2.isNormalized(fromPunycode);
if(!isValid) {
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
if(!isValid || startsWithXNDashDash(fromPunycode)) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
Expand Down Expand Up @@ -488,6 +493,12 @@ public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info i
}
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
}

/**
 * Reports whether the given text begins with the four characters "xn--".
 *
 * The comparison is exact and case-sensitive: only lowercase 'x' and 'n'
 * followed by two hyphens match. Input shorter than four characters
 * never matches.
 *
 * @param s the character sequence to inspect
 * @return true if the first four characters of s are 'x', 'n', '-', '-'
 */
private static boolean startsWithXNDashDash(CharSequence s) {
    final String prefix = "xn--";
    if (s.length() < prefix.length()) {
        return false;
    }
    for (int i = 0; i < prefix.length(); ++i) {
        if (s.charAt(i) != prefix.charAt(i)) {
            return false;
        }
    }
    return true;
}

private int
markBadACELabel(StringBuilder dest,
int labelStart, int labelLength,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ public void TestACELabelEdgeCases() {
info=new IDNA.Info();
idna.labelToUnicode("Xn---", result, info);
assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));

// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
idna.labelToUnicode("xn--xn---epa", result, info);
assertTrue("error for xn--xn---epa",
info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
}

@Test
Expand Down Expand Up @@ -813,6 +822,16 @@ public void TestSomeCases() {
}
}

/**
 * Decodes one field of a test data line.
 *
 * The field is trimmed and then passed through Utility.unescape.
 * Two special values are recognized afterwards:
 * a blank field yields sameAs, and a field consisting of exactly two
 * double-quote characters yields the empty string
 * (explicit empty string, new in Unicode 16).
 *
 * @param s the raw field text
 * @param sameAs the value to return when the field is blank
 * @return the decoded field value
 */
private static String escapeTestString(String s, String sameAs) {
    final String unescaped = Utility.unescape(s.trim());
    if (unescaped.isEmpty()) {
        // blank means same as another string
        return sameAs;
    }
    if ("\"\"".equals(unescaped)) {
        // explicit empty string (new in Unicode 16)
        return "";
    }
    return unescaped;
}

private void checkIdnaTestResult(String line, String type,
String expected, CharSequence result, String status, IDNA.Info info) {
// An error in toUnicode or toASCII is indicated by a value in square brackets,
Expand All @@ -826,6 +845,18 @@ private void checkIdnaTestResult(String line, String type,
if (!status.equals("[]")) {
expectedHasErrors = true;
}
// ICU workaround:
// We do effectively VerifyDnsLength (we always check for lengths), except,
// based on past bug reports, we do not do the following in UTS #46 ToASCII:
// When VerifyDnsLength is true, the empty root label is disallowed.
// Ignore the expected error if it is the only one.
// TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
if (type.startsWith("toASCII") && status.equals("[A4_2]") && !info.hasErrors()) {
String a = result.toString();
if (a.endsWith(".") && !a.contains("..")) {
expectedHasErrors = false;
}
}
}
if (expectedHasErrors != info.hasErrors()) {
errln(String.format(
Expand All @@ -841,10 +872,6 @@ private void checkIdnaTestResult(String line, String type,

@Test
public void IdnaTest() throws IOException {
if (logKnownIssue("ICU-22707",
"The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
return;
}
BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTestV2.txt", "UTF-8");
Pattern semi = Pattern.compile(";");
try {
Expand All @@ -862,16 +889,15 @@ public void IdnaTest() throws IOException {

// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
String source = Utility.unescape(fields[0].trim());
// The source string to be tested.
// "" means the empty string.
String source = escapeTestString(fields[0], "");

// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
String toUnicode = Utility.unescape(fields[1].trim());
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// "" means the empty string.
String toUnicode = escapeTestString(fields[1], source);

// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
Expand All @@ -881,10 +907,8 @@ public void IdnaTest() throws IOException {
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
String toAsciiN = Utility.unescape(fields[3].trim());
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// "" means the empty string.
String toAsciiN = escapeTestString(fields[3], toUnicode);

// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
Expand All @@ -897,10 +921,8 @@ public void IdnaTest() throws IOException {
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
String toAsciiT = Utility.unescape(fields[5].trim());
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// "" means the empty string.
String toAsciiT = escapeTestString(fields[5], toAsciiN);

// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.
Expand Down

0 comments on commit 415a7ac

Please sign in to comment.