ICU-22707 adjust UTS46 for Unicode 16

See #3130
unicode-org · Sep 6, 2024 · commit 415a7ac · 1 parent 6d67afc

Showing 4 changed files with 102 additions and 42 deletions.
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp
@@ -756,7 +756,12 @@ UTS46::processLabel(UnicodeString &dest,
         if(U_FAILURE(errorCode)) {
             return labelLength;
         }
-        if(!isValid) {
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        if(!isValid || fromPunycode.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
         }

diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp
@@ -340,6 +340,18 @@ void UTS46Test::TestACELabelEdgeCases() {
         idna->labelToUnicode(u"Xn---", result, info, errorCode);
         assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
     }
+
+    {
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        IDNAInfo info;
+        idna->labelToUnicode(u"xn--xn---epa", result, info, errorCode);
+        assertTrue("error for xn--xn---epa",
+                (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
 }
 
 void UTS46Test::TestTooLong() {
@@ -1016,9 +1028,15 @@ idnaTestLineFn(void *context,
     reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
 }
 
-UnicodeString s16FromField(char *(&field)[2]) {
+UnicodeString s16FromField(char *(&field)[2], const UnicodeString &sameAs) {
     int32_t length = static_cast<int32_t>(field[1] - field[0]);
-    return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
+    // Decode the UTF-8 bytes in [field[0], field[1]), trim them, then unescape.
+    UnicodeString text = UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
+    if (text.isEmpty()) {
+        return sameAs;  // a blank field stands for the other string
+    }
+    // A literal "" (new in Unicode 16) explicitly means the empty string.
+    return text == u"\"\"" ? UnicodeString() : text;
 }
 
 std::string statusFromField(char *(&field)[2]) {
@@ -1049,6 +1067,20 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
         if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
             expectedHasErrors = true;
         }
+        // ICU workaround:
+        // We do effectively VerifyDnsLength (we always check for lengths), except,
+        // based on past bug reports, we do not do the following in UTS #46 ToASCII:
+        // When VerifyDnsLength is true, the empty root label is disallowed.
+        // Ignore the expected error if it is the only one.
+        // TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
+        if (strncmp(type, "toASCII", 7) == 0 &&  // startsWith
+                strcmp(status, "[A4_2]") == 0 && !info.hasErrors()) {
+            if (result.endsWith(UnicodeString::readOnlyAlias(u".")) &&
+                    // !contains
+                    result.indexOf(UnicodeString::readOnlyAlias(u"..")) < 0) {
+                expectedHasErrors = false;
+            }
+        }
     }
     if (expectedHasErrors != info.hasErrors()) {
         errln("%s  expected errors %s %d != %d = actual has errors: %04lx\n    %s",
@@ -1064,16 +1096,15 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
 void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
     // IdnaTestV2.txt (since Unicode 11)
     // Column 1: source
-    // The source string to be tested
-    UnicodeString source = s16FromField(fields[0]);
+    // The source string to be tested.
+    // "" means the empty string.
+    UnicodeString source = s16FromField(fields[0], UnicodeString());
 
     // Column 2: toUnicode
     // The result of applying toUnicode to the source, with Transitional_Processing=false.
     // A blank value means the same as the source value.
-    UnicodeString toUnicode = s16FromField(fields[1]);
-    if (toUnicode.isEmpty()) {
-        toUnicode = source;
-    }
+    // "" means the empty string.
+    UnicodeString toUnicode = s16FromField(fields[1], source);
 
     // Column 3: toUnicodeStatus
     // A set of status codes, each corresponding to a particular test.
@@ -1083,10 +1114,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
     // Column 4: toAsciiN
     // The result of applying toASCII to the source, with Transitional_Processing=false.
     // A blank value means the same as the toUnicode value.
-    UnicodeString toAsciiN = s16FromField(fields[3]);
-    if (toAsciiN.isEmpty()) {
-        toAsciiN = toUnicode;
-    }
+    // "" means the empty string.
+    UnicodeString toAsciiN = s16FromField(fields[3], toUnicode);
 
     // Column 5: toAsciiNStatus
     // A set of status codes, each corresponding to a particular test.
@@ -1099,10 +1128,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
     // Column 6: toAsciiT
     // The result of applying toASCII to the source, with Transitional_Processing=true.
     // A blank value means the same as the toAsciiN value.
-    UnicodeString toAsciiT = s16FromField(fields[5]);
-    if (toAsciiT.isEmpty()) {
-        toAsciiT = toAsciiN;
-    }
+    // "" means the empty string.
+    UnicodeString toAsciiT = s16FromField(fields[5], toAsciiN);
 
     // Column 7: toAsciiTStatus
     // A set of status codes, each corresponding to a particular test.
@@ -1133,12 +1160,7 @@ U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
 
 }  // namespace
 
-// http://www.unicode.org/Public/idna/latest/IdnaTest.txt
 void UTS46Test::IdnaTest() {
-    if (logKnownIssue("ICU-22707",
-                      "The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
-        return;
-    }
     IcuTestErrorCode errorCode(*this, "IdnaTest");
     const char *sourceTestDataPath = getSourceTestData(errorCode);
     if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
@@ -1158,7 +1180,7 @@ void UTS46Test::IdnaTest() {
     // Comments are indicated with hash marks.
     char *fields[kNumFields][2];
     u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
-    if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
+    if (errorCode.errIfFailureAndReset("error parsing IdnaTestV2.txt")) {
         return;
     }
 }

diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/impl/UTS46.java b/icu4j/main/core/src/main/java/com/ibm/icu/impl/UTS46.java
@@ -358,7 +358,12 @@ public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info i
             // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
             // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
             boolean isValid=uts46Norm2.isNormalized(fromPunycode);
-            if(!isValid) {
+            // Unicode 15.1 UTS #46:
+            // Added an additional condition in 4.1 Validity Criteria to
+            // disallow labels such as xn--xn---epa., which do not round-trip.
+            // --> Validity Criteria new criterion 4:
+            // If not CheckHyphens, the label must not begin with “xn--”.
+            if(!isValid || startsWithXNDashDash(fromPunycode)) {
                 addLabelError(info, Error.INVALID_ACE_LABEL);
                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
             }
@@ -488,6 +493,12 @@ public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info i
         }
         return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
     }
+
+    private static boolean startsWithXNDashDash(CharSequence s) {
+        // Compare only the first four chars, so any CharSequence can be checked.
+        return s.length()>=4 && "xn--".contentEquals(s.subSequence(0, 4));
+    }
+
     private int
     markBadACELabel(StringBuilder dest,
                     int labelStart, int labelLength,

diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -168,6 +168,15 @@ public void TestACELabelEdgeCases() {
         info=new IDNA.Info();
         idna.labelToUnicode("Xn---", result, info);
         assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        idna.labelToUnicode("xn--xn---epa", result, info);
+        assertTrue("error for xn--xn---epa",
+                info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
     }
 
     @Test
@@ -813,6 +822,16 @@ public void TestSomeCases() {
         }
     }
 
+    private static String escapeTestString(String s, String sameAs) {
+        s = Utility.unescape(s.trim());
+        if (s.isEmpty()) {
+            s = sameAs;  // blank means same as another string
+        } else if (s.equals("\"\"")) {
+            s = "";  // explicit empty string (new in Unicode 16)
+        }
+        return s;
+    }
+
     private void checkIdnaTestResult(String line, String type,
             String expected, CharSequence result, String status, IDNA.Info info) {
         // An error in toUnicode or toASCII is indicated by a value in square brackets,
@@ -826,6 +845,18 @@ private void checkIdnaTestResult(String line, String type,
             if (!status.equals("[]")) {
                 expectedHasErrors = true;
             }
+            // ICU workaround:
+            // We do effectively VerifyDnsLength (we always check for lengths), except,
+            // based on past bug reports, we do not do the following in UTS #46 ToASCII:
+            // When VerifyDnsLength is true, the empty root label is disallowed.
+            // Ignore the expected error if it is the only one.
+            // TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
+            if (type.startsWith("toASCII") && status.equals("[A4_2]") && !info.hasErrors()) {
+                String a = result.toString();
+                if (a.endsWith(".") && !a.contains("..")) {
+                    expectedHasErrors = false;
+                }
+            }
         }
         if (expectedHasErrors != info.hasErrors()) {
             errln(String.format(
@@ -841,10 +872,6 @@ private void checkIdnaTestResult(String line, String type,
 
     @Test
     public void IdnaTest() throws IOException {
-        if (logKnownIssue("ICU-22707",
-                "The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
-            return;
-        }
         BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTestV2.txt", "UTF-8");
         Pattern semi = Pattern.compile(";");
         try {
@@ -862,16 +889,15 @@ public void IdnaTest() throws IOException {
 
                 // IdnaTestV2.txt (since Unicode 11)
                 // Column 1: source
-                // The source string to be tested
-                String source = Utility.unescape(fields[0].trim());
+                // The source string to be tested.
+                // "" means the empty string.
+                String source = escapeTestString(fields[0], "");
 
                 // Column 2: toUnicode
                 // The result of applying toUnicode to the source, with Transitional_Processing=false.
                 // A blank value means the same as the source value.
-                String toUnicode = Utility.unescape(fields[1].trim());
-                if (toUnicode.isEmpty()) {
-                    toUnicode = source;
-                }
+                // "" means the empty string.
+                String toUnicode = escapeTestString(fields[1], source);
 
                 // Column 3: toUnicodeStatus
                 // A set of status codes, each corresponding to a particular test.
@@ -881,10 +907,8 @@ public void IdnaTest() throws IOException {
                 // Column 4: toAsciiN
                 // The result of applying toASCII to the source, with Transitional_Processing=false.
                 // A blank value means the same as the toUnicode value.
-                String toAsciiN = Utility.unescape(fields[3].trim());
-                if (toAsciiN.isEmpty()) {
-                    toAsciiN = toUnicode;
-                }
+                // "" means the empty string.
+                String toAsciiN = escapeTestString(fields[3], toUnicode);
 
                 // Column 5: toAsciiNStatus
                 // A set of status codes, each corresponding to a particular test.
@@ -897,10 +921,8 @@ public void IdnaTest() throws IOException {
                 // Column 6: toAsciiT
                 // The result of applying toASCII to the source, with Transitional_Processing=true.
                 // A blank value means the same as the toAsciiN value.
-                String toAsciiT = Utility.unescape(fields[5].trim());
-                if (toAsciiT.isEmpty()) {
-                    toAsciiT = toAsciiN;
-                }
+                // "" means the empty string.
+                String toAsciiT = escapeTestString(fields[5], toAsciiN);
 
                 // Column 7: toAsciiTStatus
                 // A set of status codes, each corresponding to a particular test.