Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 48 additions & 23 deletions make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package build.tools.generatecharacter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.IntStream;

public class CaseFolding {

Expand All @@ -42,32 +41,58 @@ public static void main(String[] args) throws Throwable {
var templateFile = Paths.get(args[0]);
var caseFoldingTxt = Paths.get(args[1]);
var genSrcFile = Paths.get(args[2]);
var supportedTypes = "^.*; [CTS]; .*$";

// java.lang
var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
var caseFoldingEntries = Files.lines(caseFoldingTxt)
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
.map(line -> {
String[] cols = line.split("; ");
return new String[] {cols[0], cols[1], cols[2]};
})
.filter(cols -> {
// the folding case doesn't map back to the original char.
var cp1 = Integer.parseInt(cols[0], 16);
var cp2 = Integer.parseInt(cols[2], 16);
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
})
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
.collect(Collectors.joining(",\n", "", ""));
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
.map(line -> {
var fields = line.split("; ");
var cp = Integer.parseInt(fields[0], 16);
fields = fields[2].trim().split(" ");
var folding = new int[fields.length];
for (int i = 0; i < folding.length; i++) {
folding[i] = Integer.parseInt(fields[i], 16);
}
var foldingChars = Arrays.stream(folding)
.mapToObj(Character::toChars)
.flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i]))
.toArray();
return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)",
cp,
Arrays.stream(foldingChars)
.mapToObj(c -> String.format("0x%04x", c))
.collect(Collectors.joining(", ", "new char[] {", "}"))
);
})
.collect(Collectors.joining(",\n", "", ""));
// util.regex
var expandedSupportedTypes = "^.*; [CTS]; .*$";
var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt)
.filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
.map(line -> {
String[] cols = line.split("; ");
return new String[]{cols[0], cols[1], cols[2]};
})
.filter(cols -> {
// the folding case doesn't map back to the original char.
var cp1 = Integer.parseInt(cols[0], 16);
var cp2 = Integer.parseInt(cols[2], 16);
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
})
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
.collect(Collectors.joining(",\n", "", ""));

// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
// 0049; T; 0131; # LATIN CAPITAL LETTER I
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

// Generate .java file
Files.write(
genSrcFile,
Files.lines(templateFile)
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
.collect(Collectors.toList()),
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
genSrcFile,
Files.lines(templateFile)
.map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line)
.map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line)
.collect(Collectors.toList()),
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
}
}
17 changes: 17 additions & 0 deletions make/modules/java.base/gensrc/GensrcCharacterData.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)

################################################################################


GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we target the package jdk.internal.lang instead of jdk.internal.java.lang? I think the former is the convention set forth by stable values.


STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(STRINGCASEFOLDING_TEMPLATE) \
$(CASEFOLDINGTXT) \
$(GENSRC_STRINGCASEFOLDING)

TARGETS += $(GENSRC_STRINGCASEFOLDING)


endif # include guard
include MakeIncludeEnd.gmk
17 changes: 0 additions & 17 deletions make/modules/java.base/gensrc/GensrcRegex.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)

################################################################################

GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java

CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)

TARGETS += $(GENSRC_CASEFOLDING)

################################################################################

endif # include guard
include MakeIncludeEnd.gmk
123 changes: 123 additions & 0 deletions src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -2180,6 +2180,7 @@ public boolean contentEquals(CharSequence cs) {
* false} otherwise
*
* @see #equals(Object)
* @see #equalsFoldCase(String)
* @see #codePoints()
*/
public boolean equalsIgnoreCase(String anotherString) {
Expand All @@ -2189,6 +2190,56 @@ public boolean equalsIgnoreCase(String anotherString) {
&& regionMatches(true, 0, anotherString, 0, length());
}

/**
* Compares this {@code String} to another {@code String} for equality,
* using <em>{@index "Unicode case folding"}</em>. Two strings are considered equal
* by this method if their case-folded forms are identical.
* <p>
* Case folding is defined by the Unicode Standard in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")}
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
* to {@code "ss"}.
* <p>
* Case folding is locale-independent and language-neutral, unlike
* locale-sensitive transformations such as {@link #toLowerCase()} or
* {@link #toUpperCase()}. It is intended for caseless matching,
* searching, and indexing.
*
* @apiNote
* This method is the Unicode-compliant alternative to
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
* defined by the Unicode Standard, which may differ from the simpler
* per-character mapping performed by {@code equalsIgnoreCase}.
* For example:
* {@snippet lang=java :
* String a = "Fuß";
* String b = "FUSS";
* boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
* boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
* }
*
* @param anotherString
* The {@code String} to compare this {@code String} against
*
* @return {@code true} if the argument is not {@code null} and represents
* the same sequence of characters as this string under Unicode case
* folding; {@code false} otherwise.
*
* @see #compareToFoldCase(String)
* @see #equalsIgnoreCase(String)
* @since 26
*/
public boolean equalsFoldCase(String anotherString) {
    // Identity short-circuits the comparison, and a null argument is never
    // equal. Otherwise the two strings are equal exactly when the
    // case-folding comparator reports no ordering difference between them.
    return this == anotherString
            || (anotherString != null
                    && UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0);
}

/**
* Compares two strings lexicographically.
* The comparison is based on the Unicode value of each character in
Expand Down Expand Up @@ -2304,12 +2355,84 @@ public int compare(String s1, String s2) {
* than this String, ignoring case considerations.
* @see java.text.Collator
* @see #codePoints()
* @see #compareToFoldCase(String)
* @since 1.2
*/
public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}

/**
* A Comparator that orders {@code String} objects as by
* {@link #compareToFoldCase(String) compareToFoldCase()}.
*
* @see #compareToFoldCase(String)
* @since 26
*/
public static final Comparator<String> UNICODE_CASEFOLD_ORDER
= new FoldCaseComparator();

private static class FoldCaseComparator implements Comparator<String> {

    /**
     * Compares two strings by handing their backing byte arrays to the
     * case-folding comparison routine that matches the pair of coders:
     * same-coder pairs go to {@code compareToFC}, mixed pairs go to the
     * {@code compareToFC_UTF16} / {@code compareToFC_Latin1} variants.
     *
     * @param s1 the left-hand string; must not be {@code null}
     * @param s2 the right-hand string; must not be {@code null}
     * @return the value produced by the selected {@code compareToFC*} routine
     * @throws NullPointerException if either argument is {@code null}
     *         (both are dereferenced unconditionally)
     */
    @Override
    public int compare(String s1, String s2) {
        byte[] v1 = s1.value;
        byte[] v2 = s2.value;
        // Obtain both coders through the coder() accessor so the two operands
        // are derived the same way; previously s1 used the raw field while s2
        // used the accessor, so the two could disagree whenever coder() does
        // not simply return the field.
        // NOTE(review): presumably this matters when compact strings are
        // disabled - confirm against String.coder().
        byte coder = s1.coder();
        if (coder == s2.coder()) {
            return coder == LATIN1 ? StringLatin1.compareToFC(v1, v2)
                                   : StringUTF16.compareToFC(v1, v2);
        }
        return coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2)
                               : StringUTF16.compareToFC_Latin1(v1, v2);
    }
}

/**
* Compares two strings lexicographically using <em>{@index "Unicode case folding"}</em>.
* This method returns an integer whose sign is that of calling {@code compareTo}
* on the Unicode case folded version of the strings. Unicode case folding
* eliminates differences in case according to the Unicode Standard, using the
* mappings defined in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
* <p>
* Case folding is a locale-independent, language-neutral form of case mapping,
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
* which applies a simpler locale-insensitive uppercase mapping, this method
* follows the Unicode <em>{@index "full"}</em> case folding, providing stable and
* consistent results across all environments.
* <p>
* Note that this method does <em>not</em> take locale into account, and may
* produce results that differ from locale-sensitive ordering. Use
* {@link java.text.Collator} for locale-sensitive comparison.
*
* @apiNote
* This method is the Unicode-compliant alternative to
* {@link #compareToIgnoreCase(String)}. It implements the
* <em>{@index "full case folding"}</em> as defined by the Unicode Standard, which
* may differ from the simpler per-character mapping performed by
* {@code compareToIgnoreCase}.
* For example:
* {@snippet lang=java :
* String a = "Fuß";
* String b = "FUSS";
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
* }
*
* @param str the {@code String} to be compared.
* @return a negative integer, zero, or a positive integer as the specified
* String is greater than, equal to, or less than this String,
* ignoring case considerations by case folding.
* @see java.text.Collator
* @see #compareToIgnoreCase(String)
* @see #equalsFoldCase(String)
* @since 26
*/
public int compareToFoldCase(String str) {
    // Pure delegation: this string is the left operand of the shared
    // case-folding comparator. No null check is made here, so a null
    // argument is passed straight through to the comparator.
    final Comparator<String> foldCaseOrder = UNICODE_CASEFOLD_ORDER;
    return foldCaseOrder.compare(this, str);
}

/**
* Tests if two string regions are equal.
* <p>
Expand Down
Loading