Skip to content

Commit 44460bc

Browse files
cketti and jhy
authored
Use proper escaping in TokenQueue.escapeCssIdentifier() (#2305)
--------- Co-authored-by: Jonathan Hedley <jonathan@hedley.net>
1 parent 464dc27 commit 44460bc

File tree

5 files changed

+211
-8
lines changed

5 files changed

+211
-8
lines changed

src/main/java/org/jsoup/parser/TokenQueue.java

+57-6
Original file line numberDiff line numberDiff line change
@@ -273,23 +273,74 @@ public static String unescape(String in) {
273273
return StringUtil.releaseBuilder(out);
274274
}
275275

276-
/*
277-
Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
278-
valid in a selector.
276+
/**
277+
Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
278+
valid in a selector.
279+
280+
@see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a>
279281
*/
280282
public static String escapeCssIdentifier(String in) {
283+
if (in.isEmpty()) return in;
284+
281285
StringBuilder out = StringUtil.borrowBuilder();
282286
TokenQueue q = new TokenQueue(in);
287+
288+
char firstChar = q.current();
289+
if (firstChar == Hyphen_Minus) {
290+
q.advance();
291+
if (q.isEmpty()) {
292+
// If the character is the first character and is a "-" (U+002D), and there is no second character, then
293+
// the escaped character.
294+
appendEscaped(out, Hyphen_Minus);
295+
} else {
296+
out.append(Hyphen_Minus);
297+
298+
char secondChar = q.current();
299+
if (CharacterReader.isDigit(secondChar)) {
300+
// If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the
301+
// first character is a "-" (U+002D), then the character escaped as code point.
302+
appendEscapedCodepoint(out, q.consume());
303+
}
304+
}
305+
} else if (CharacterReader.isDigit(firstChar)) {
306+
// If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character
307+
// escaped as code point.
308+
appendEscapedCodepoint(out, q.consume());
309+
}
310+
283311
while (!q.isEmpty()) {
284-
if (q.matchesCssIdentifier(CssIdentifierChars)) {
285-
out.append(q.consume());
312+
// Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs
313+
// are kept intact.
314+
char c = q.consume();
315+
if (c == Unicode_Null) {
316+
// If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
317+
out.append(Replacement);
318+
} else if (c <= '\u001F' || c == '\u007F') {
319+
// If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character
320+
// escaped as code point.
321+
appendEscapedCodepoint(out, c);
322+
} else if (isIdent(c)) {
323+
// If the character is not handled by one of the above rules and is greater than or equal to U+0080,
324+
// is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039),
325+
// [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself.
326+
out.append(c);
286327
} else {
287-
out.append(Esc).append(q.consume());
328+
// Otherwise, the escaped character.
329+
appendEscaped(out, c);
288330
}
289331
}
332+
290333
return StringUtil.releaseBuilder(out);
291334
}
292335

336+
private static void appendEscaped(StringBuilder out, char c) {
337+
out.append(Esc).append(c);
338+
}
339+
340+
private static void appendEscapedCodepoint(StringBuilder out, char c) {
341+
out.append(Esc).append(Integer.toHexString(c)).append(' ');
342+
}
343+
293344
/**
294345
* Pulls the next run of whitespace characters of the queue.
295346
* @return Whether consuming whitespace or not

src/main/java/org/jsoup/select/Selector.java

+28
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.jsoup.helper.Validate;
44
import org.jsoup.nodes.Element;
5+
import org.jsoup.parser.TokenQueue;
56
import org.jspecify.annotations.Nullable;
67

78
import java.util.Collection;
@@ -218,6 +219,33 @@ static Elements filterOut(Collection<Element> elements, Collection<Element> outs
218219
return null;
219220
}
220221

222+
/**
223+
Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
224+
valid in a selector.
225+
226+
@see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a>
227+
@since 1.20.1
228+
*/
229+
public static String escapeCssIdentifier(String in) {
230+
return TokenQueue.escapeCssIdentifier(in);
231+
}
232+
233+
/**
234+
Consume a CSS identifier (ID or class) from the start of the given string and return it unescaped.
235+
<p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
236+
of {@code \31}.</p>
237+
238+
@return The unescaped identifier.
239+
@throws IllegalArgumentException if an invalid escape sequence was found.
240+
@see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a>
241+
@see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a>
242+
@since 1.20.1
243+
*/
244+
public static String unescapeCssIdentifier(String in) {
245+
TokenQueue tq = new TokenQueue(in);
246+
return tq.consumeCssIdentifier();
247+
}
248+
221249
public static class SelectorParseException extends IllegalStateException {
222250
public SelectorParseException(String msg) {
223251
super(msg);

src/test/java/org/jsoup/nodes/ElementTest.java

+20
Original file line numberDiff line numberDiff line change
@@ -2697,6 +2697,26 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
26972697
assertSelectedOwnText(selected, "One");
26982698
}
26992699

2700+
@Test void cssSelectorCombined() {
2701+
// https://github.com/jhy/jsoup/issues/1984
2702+
Document doc = Jsoup.parse("<img class='e\u0301'><p class=👨‍👨‍👧‍👧></p><a class='\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC67\u200D\uD83D\uDC67'></a>");
2703+
Element img = doc.expectFirst("img");
2704+
Element p = doc.expectFirst("p");
2705+
Element a = doc.expectFirst("a");
2706+
2707+
String imgQ = img.cssSelector();
2708+
String pQ = p.cssSelector();
2709+
String aQ = a.cssSelector();
2710+
2711+
assertEquals("html > body > img.é", imgQ); // previously was img.e\́; chrome gives literal body > img.e\\u0301
2712+
assertEquals("html > body > p.👨‍👨‍👧‍👧", pQ); // chrome gives body > p.👨‍👨‍👧‍👧
2713+
assertEquals("html > body > a.👨‍👨‍👧‍👧", aQ); // body > a.👨‍👨‍👧‍👧
2714+
2715+
assertSame(img, doc.expectFirst(imgQ));
2716+
assertSame(p, doc.expectFirst(pQ));
2717+
assertSame(a, doc.expectFirst(aQ));
2718+
}
2719+
27002720
@Test void orphanSiblings() {
27012721
Element el = new Element("div");
27022722
assertEquals(0, el.siblingElements().size());

src/test/java/org/jsoup/parser/TokenQueueTest.java

+93-2
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,99 @@ public class TokenQueueTest {
5454
assertEquals("\\&", TokenQueue.unescape("\\\\\\&"));
5555
}
5656

57-
@Test public void escapeCssIdentifier() {
58-
assertEquals("one\\#two\\.three\\/four\\\\five", TokenQueue.escapeCssIdentifier("one#two.three/four\\five"));
57+
@ParameterizedTest
58+
@MethodSource("escapeCssIdentifier_WebPlatformTestParameters")
59+
@MethodSource("escapeCssIdentifier_additionalParameters")
60+
public void escapeCssIdentifier(String expected, String input) {
61+
assertEquals(expected, TokenQueue.escapeCssIdentifier(input));
62+
}
63+
64+
// https://github.com/web-platform-tests/wpt/blob/328fa1c67bf5dfa6f24571d4c41dd10224b6d247/css/cssom/escape.html
65+
private static Stream<Arguments> escapeCssIdentifier_WebPlatformTestParameters() {
66+
return Stream.of(
67+
Arguments.of("", ""),
68+
69+
// Null bytes
70+
Arguments.of("\uFFFD", "\0"),
71+
Arguments.of("a\uFFFD", "a\0"),
72+
Arguments.of("\uFFFDb", "\0b"),
73+
Arguments.of("a\uFFFDb", "a\0b"),
74+
75+
// Replacement character
76+
Arguments.of("\uFFFD", "\uFFFD"),
77+
Arguments.of("a\uFFFD", "a\uFFFD"),
78+
Arguments.of("\uFFFDb", "\uFFFDb"),
79+
Arguments.of("a\uFFFDb", "a\uFFFDb"),
80+
81+
// Number prefix
82+
Arguments.of("\\30 a", "0a"),
83+
Arguments.of("\\31 a", "1a"),
84+
Arguments.of("\\32 a", "2a"),
85+
Arguments.of("\\33 a", "3a"),
86+
Arguments.of("\\34 a", "4a"),
87+
Arguments.of("\\35 a", "5a"),
88+
Arguments.of("\\36 a", "6a"),
89+
Arguments.of("\\37 a", "7a"),
90+
Arguments.of("\\38 a", "8a"),
91+
Arguments.of("\\39 a", "9a"),
92+
93+
// Letter number prefix
94+
Arguments.of("a0b", "a0b"),
95+
Arguments.of("a1b", "a1b"),
96+
Arguments.of("a2b", "a2b"),
97+
Arguments.of("a3b", "a3b"),
98+
Arguments.of("a4b", "a4b"),
99+
Arguments.of("a5b", "a5b"),
100+
Arguments.of("a6b", "a6b"),
101+
Arguments.of("a7b", "a7b"),
102+
Arguments.of("a8b", "a8b"),
103+
Arguments.of("a9b", "a9b"),
104+
105+
// Dash number prefix
106+
Arguments.of("-\\30 a", "-0a"),
107+
Arguments.of("-\\31 a", "-1a"),
108+
Arguments.of("-\\32 a", "-2a"),
109+
Arguments.of("-\\33 a", "-3a"),
110+
Arguments.of("-\\34 a", "-4a"),
111+
Arguments.of("-\\35 a", "-5a"),
112+
Arguments.of("-\\36 a", "-6a"),
113+
Arguments.of("-\\37 a", "-7a"),
114+
Arguments.of("-\\38 a", "-8a"),
115+
Arguments.of("-\\39 a", "-9a"),
116+
117+
// Double dash prefix
118+
Arguments.of("--a", "--a"),
119+
120+
// Various tests
121+
Arguments.of("\\1 \\2 \\1e \\1f ", "\u0001\u0002\u001E\u001F"),
122+
Arguments.of("\u0080\u002D\u005F\u00A9", "\u0080\u002D\u005F\u00A9"),
123+
Arguments.of("\\7f \u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F", "\u007F\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F"),
124+
Arguments.of("\u00A0\u00A1\u00A2", "\u00A0\u00A1\u00A2"),
125+
Arguments.of("a0123456789b", "a0123456789b"),
126+
Arguments.of("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"),
127+
Arguments.of("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
128+
129+
Arguments.of("hello\\\\world", "hello\\world"), // Backslashes get backslash-escaped
130+
Arguments.of("hello\u1234world", "hello\u1234world"), // Code points greater than U+0080 are preserved
131+
Arguments.of("\\-", "-"), // CSS.escape: Single dash escaped
132+
133+
Arguments.of("\\ \\!xy", "\u0020\u0021\u0078\u0079"),
134+
135+
// astral symbol (U+1D306 TETRAGRAM FOR CENTRE)
136+
Arguments.of("\uD834\uDF06", "\uD834\uDF06"),
137+
138+
// lone surrogates
139+
Arguments.of("\uDF06", "\uDF06"),
140+
Arguments.of("\uD834", "\uD834")
141+
);
142+
}
143+
144+
private static Stream<Arguments> escapeCssIdentifier_additionalParameters() {
145+
return Stream.of(
146+
Arguments.of("one\\#two\\.three\\/four\\\\five", "one#two.three/four\\five"),
147+
Arguments.of("-a", "-a"),
148+
Arguments.of("--", "--")
149+
);
59150
}
60151

61152
@Test public void chompToIgnoreCase() {

src/test/java/org/jsoup/select/SelectorTest.java

+13
Original file line numberDiff line numberDiff line change
@@ -1461,4 +1461,17 @@ public void testAncestorChain() {
14611461
Element img = doc.expectFirst(q);
14621462
assertEquals("img", img.tagName());
14631463
}
1464+
1465+
@Test void escapeCssIdentifier() {
1466+
// thorough tests are in TokenQueue
1467+
assertEquals("-\\30 a", Selector.escapeCssIdentifier("-0a"));
1468+
assertEquals("a0b", Selector.escapeCssIdentifier("a0b"));
1469+
}
1470+
1471+
@Test void unescapeCssIdentifier() {
1472+
// thorough tests are in TokenQueue
1473+
assertEquals("-0a", Selector.unescapeCssIdentifier("-\\30 a"));
1474+
assertEquals("a0b", Selector.unescapeCssIdentifier("a0b"));
1475+
}
1476+
14641477
}

0 commit comments

Comments
 (0)