Skip to content

Commit fea895b

Browse files
committed
Merge branch 'PHP-8.3'
* PHP-8.3: Character indices used by mb_strpos and mb_substr have same meaning, even on invalid strings
2 parents 3c3aba1 + ec348a1 commit fea895b

File tree

4 files changed

+45
-56
lines changed

4 files changed

+45
-56
lines changed

UPGRADING

+7
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES
3535
. mb_encode_numericentity() and mb_decode_numericentity() now check that
3636
the $map is only composed of integers, if not a ValueError is thrown.
3737
. mb_http_input() now always throws a ValueError if the $type is invalid.
38+
. On invalid strings (those with encoding errors), mb_substr() now interprets
39+
character indices in the same manner as most other mbstring functions. This
40+
means that character indices returned by mb_strpos() can be passed to mb_substr().
41+
. For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now
42+
refer to the indices of the Unicode codepoints which are produced when the string
43+
is converted to Unicode. This is significant because around 40 SJIS-Mac characters
44+
convert to a sequence of multiple Unicode codepoints.
3845

3946
- PDO_DBLIB:
4047
. setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT

ext/mbstring/mbstring.c

+5-41
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "libmbfl/mbfl/mbfilter_wchar.h"
3939
#include "libmbfl/mbfl/eaw_table.h"
4040
#include "libmbfl/filters/mbfilter_base64.h"
41+
#include "libmbfl/filters/mbfilter_cjk.h"
4142
#include "libmbfl/filters/mbfilter_qprint.h"
4243
#include "libmbfl/filters/mbfilter_htmlent.h"
4344
#include "libmbfl/filters/mbfilter_uuencode.h"
@@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
21122113
unsigned char *in = (unsigned char*)ZSTR_VAL(input);
21132114
size_t in_len = ZSTR_LEN(input);
21142115

2115-
if (from >= in_len || len == 0) {
2116-
/* No supported text encoding decodes to more than one codepoint per byte
2116+
if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
2117+
/* Other than MacJapanese, no supported text encoding decodes to
2118+
* more than one codepoint per byte.
21172119
* So if the number of codepoints to skip >= number of input bytes,
21182120
* then definitely the output should be empty */
21192121
return zend_empty_string;
@@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
21342136
len = in_len;
21352137
}
21362138
return zend_string_init_fast((const char*)in, len);
2137-
} else if (enc->mblen_table) {
2138-
/* The use of the `mblen_table` means that for encodings like MacJapanese,
2139-
* we treat each character in its native charset as "1 character", even if it
2140-
* maps to a sequence of several codepoints */
2141-
const unsigned char *mbtab = enc->mblen_table;
2142-
unsigned char *limit = in + in_len;
2143-
while (from && in < limit) {
2144-
in += mbtab[*in];
2145-
from--;
2146-
}
2147-
if (in >= limit) {
2148-
return zend_empty_string;
2149-
} else if (len == MBFL_SUBSTR_UNTIL_END) {
2150-
return zend_string_init_fast((const char*)in, limit - in);
2151-
}
2152-
unsigned char *end = in;
2153-
while (len && end < limit) {
2154-
end += mbtab[*end];
2155-
len--;
2156-
}
2157-
if (end > limit) {
2158-
end = limit;
2159-
}
2160-
return zend_string_init_fast((const char*)in, end - in);
21612139
}
21622140

21632141
return mb_get_substr_slow(in, in_len, from, len, enc);
@@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr)
23502328

23512329
size_t mblen = 0;
23522330
if (from < 0 || (!len_is_null && len < 0)) {
2353-
if (enc->mblen_table) {
2354-
/* Because we use the `mblen_table` when iterating over the string and
2355-
* extracting the requested part, we also need to use it here for counting
2356-
* the "length" of the string
2357-
* Otherwise, we can get wrong results for text encodings like MacJapanese,
2358-
* where one native 'character' can map to a sequence of several codepoints */
2359-
const unsigned char *mbtab = enc->mblen_table;
2360-
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
2361-
while (p < e) {
2362-
p += mbtab[*p];
2363-
mblen++;
2364-
}
2365-
} else {
2366-
mblen = mb_get_strlen(str, enc);
2367-
}
2331+
mblen = mb_get_strlen(str, enc);
23682332
}
23692333

23702334
/* if "from" position is negative, count start position from the end

ext/mbstring/tests/mb_strstr.phpt

+8-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP(
2626
var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
2727
var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));
2828

29+
// Test handling of invalid UTF-8 string
30+
// Thanks to Stefan Schiller
31+
var_dump(mb_strstr("\xf0start", "start", false, "UTF-8"));
32+
var_dump(mb_strstr("\xf0start", "start", true, "UTF-8"));
33+
2934
?>
3035
--EXPECT--
3136
string(18) "おかきくけこ"
@@ -36,5 +41,7 @@ string(12) "あいうえ"
3641
string(18) "おかきくけこ"
3742
string(18) "おかきくけこ"
3843
string(12) "あいうえ"
39-
string(4) "dd00"
44+
string(4) "3f00"
4045
string(2) "00"
46+
string(5) "start"
47+
string(1) "?"

ext/mbstring/tests/mb_substr.phpt

+25-14
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
118118
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
119119
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
120120

121+
echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
122+
/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
123+
* by mb_strpos would not extract the desired part of the string when passed to mb_substr.
124+
* This is the test case which he provided: */
125+
$data = "\xF0AAA<b>";
126+
$pos = mb_strpos($data, "<", 0, "UTF-8");
127+
$out = mb_substr($data, 0, $pos, "UTF-8");
128+
print $out . "\n";
129+
121130
echo "Regression:\n";
122131
/* During development, one >= comparison in mb_get_substr was wrongly written as >
123132
* This was caught by libFuzzer */
@@ -138,30 +147,30 @@ SJIS:
138147
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
139148
5:
140149
-- Testing illegal SJIS byte 0x80 --
141-
6380
142-
806162
150+
633f
151+
3f6162
143152
SJIS-2004:
144-
6380
145-
806162
153+
633f
154+
3f6162
146155
MacJapanese:
147156
6380
148157
806162
149158
SJIS-Mobile#DOCOMO:
150-
6380
151-
806162
159+
633f
160+
3f6162
152161
SJIS-Mobile#KDDI:
153-
6380
154-
806162
162+
633f
163+
3f6162
155164
SJIS-Mobile#SoftBank:
156-
6380
157-
806162
165+
633f
166+
3f6162
158167
-- Testing MacJapanese characters which map to 3-5 codepoints each --
159168
616263
160-
85ab85ac
161-
85ac
169+
3f3f
170+
58
162171
616263
163-
85bf85c0
164-
85c0
172+
3f3f
173+
78
165174
ISO-2022-JP:
166175
1: 1b2442212121721b284241
167176
2: 43
@@ -200,5 +209,7 @@ UTF-7:
200209
3: йте
201210
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
202211
5:
212+
Testing agreement with mb_strpos on invalid UTF-8 string:
213+
?AAA
203214
Regression:
204215
1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842

0 commit comments

Comments
 (0)