Merge branch 'PHP-8.3'

alexdowad · alexdowad · commit fea895bb490b · 2023-12-10T15:04:14.000+02:00
* PHP-8.3:
  Character indices used by mb_strpos and mb_substr have same meaning, even on invalid strings
diff --git a/UPGRADING b/UPGRADING
@@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES
   . mb_encode_numericentity() and mb_decode_numericentity() now check that
     the $map is only composed of integers, if not a ValueError is thrown.
   . mb_http_input() now always throws a ValueError if the $type is invalid.
+  . On invalid strings (those with encoding errors), mb_substr() now interprets
+    character indices in the same manner as most other mbstring functions. This
+    means that character indices returned by mb_strpos() can be passed to mb_substr().
+  . For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now
+    refer to the indices of the Unicode codepoints which are produced when the string
+    is converted to Unicode. This is significant because around 40 SJIS-Mac characters
+    convert to a sequence of multiple Unicode codepoints.
 
 - PDO_DBLIB:
   . setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -38,6 +38,7 @@
 #include "libmbfl/mbfl/mbfilter_wchar.h"
 #include "libmbfl/mbfl/eaw_table.h"
 #include "libmbfl/filters/mbfilter_base64.h"
+#include "libmbfl/filters/mbfilter_cjk.h"
 #include "libmbfl/filters/mbfilter_qprint.h"
 #include "libmbfl/filters/mbfilter_htmlent.h"
 #include "libmbfl/filters/mbfilter_uuencode.h"
@@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
 	size_t in_len = ZSTR_LEN(input);
 
-	if (from >= in_len || len == 0) {
-		/* No supported text encoding decodes to more than one codepoint per byte
+	if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
+		/* Other than MacJapanese, no supported text encoding decodes to
+		 * more than one codepoint per byte
 		 * So if the number of codepoints to skip >= number of input bytes,
 		 * then definitely the output should be empty */
 		return zend_empty_string;
@@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
 			len = in_len;
 		}
 		return zend_string_init_fast((const char*)in, len);
-	} else if (enc->mblen_table) {
-		/* The use of the `mblen_table` means that for encodings like MacJapanese,
-		 * we treat each character in its native charset as "1 character", even if it
-		 * maps to a sequence of several codepoints */
-		const unsigned char *mbtab = enc->mblen_table;
-		unsigned char *limit = in + in_len;
-		while (from && in < limit) {
-			in += mbtab[*in];
-			from--;
-		}
-		if (in >= limit) {
-			return zend_empty_string;
-		} else if (len == MBFL_SUBSTR_UNTIL_END) {
-			return zend_string_init_fast((const char*)in, limit - in);
-		}
-		unsigned char *end = in;
-		while (len && end < limit) {
-			end += mbtab[*end];
-			len--;
-		}
-		if (end > limit) {
-			end = limit;
-		}
-		return zend_string_init_fast((const char*)in, end - in);
 	}
 
 	return mb_get_substr_slow(in, in_len, from, len, enc);
@@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr)
 
 	size_t mblen = 0;
 	if (from < 0 || (!len_is_null && len < 0)) {
-		if (enc->mblen_table) {
-			/* Because we use the `mblen_table` when iterating over the string and
-			 * extracting the requested part, we also need to use it here for counting
-			 * the "length" of the string
-			 * Otherwise, we can get wrong results for text encodings like MacJapanese,
-			 * where one native 'character' can map to a sequence of several codepoints */
-			const unsigned char *mbtab = enc->mblen_table;
-			unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
-			while (p < e) {
-				p += mbtab[*p];
-				mblen++;
-			}
-		} else {
-			mblen = mb_get_strlen(str, enc);
-		}
+		mblen = mb_get_strlen(str, enc);
 	}
 
 	/* if "from" position is negative, count start position from the end
diff --git a/ext/mbstring/tests/mb_strstr.phpt b/ext/mbstring/tests/mb_strstr.phpt
@@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("&#12354;&#12356;&#12358;&#12360;&#12362;&#12363;&#12365;&#12367;&#12369;&#12371;"), EUC_JP(
 var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
 var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));
 
+// Test handling of invalid UTF-8 string
+// Thanks to Stefan Schiller
+var_dump(mb_strstr("\xf0start", "start", false, "UTF-8"));
+var_dump(mb_strstr("\xf0start", "start", true, "UTF-8"));
+
 ?>
 --EXPECT--
 string(18) "&#12362;&#12363;&#12365;&#12367;&#12369;&#12371;"
@@ -36,5 +41,7 @@ string(12) "&#12354;&#12356;&#12358;&#12360;"
 string(18) "&#12362;&#12363;&#12365;&#12367;&#12369;&#12371;"
 string(18) "&#12362;&#12363;&#12365;&#12367;&#12369;&#12371;"
 string(12) "&#12354;&#12356;&#12358;&#12360;"
-string(4) "dd00"
+string(4) "3f00"
 string(2) "00"
+string(5) "start"
+string(1) "?"
diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt
@@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
 print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
 print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
 
+echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
+/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
+ * by mb_strpos would not extract the desired part of the string when passed to mb_substr.
+ * This is the test case which he provided: */
+$data = "\xF0AAA<b>";
+$pos = mb_strpos($data, "<", 0, "UTF-8");
+$out = mb_substr($data, 0, $pos, "UTF-8");
+print $out . "\n";
+
 echo "Regression:\n";
 /* During development, one >= comparison in mb_get_substr was wrongly written as >
  * This was caught by libFuzzer */
@@ -138,30 +147,30 @@ SJIS:
 4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
 5:
 -- Testing illegal SJIS byte 0x80 --
-6380
-806162
+633f
+3f6162
 SJIS-2004:
-6380
-806162
+633f
+3f6162
 MacJapanese:
 6380
 806162
 SJIS-Mobile#DOCOMO:
-6380
-806162
+633f
+3f6162
 SJIS-Mobile#KDDI:
-6380
-806162
+633f
+3f6162
 SJIS-Mobile#SoftBank:
-6380
-806162
+633f
+3f6162
 -- Testing MacJapanese characters which map to 3-5 codepoints each --
 616263
-85ab85ac
-85ac
+3f3f
+58
 616263
-85bf85c0
-85c0
+3f3f
+78
 ISO-2022-JP:
 1: 1b2442212121721b284241
 2: 43
@@ -200,5 +209,7 @@ UTF-7:
 3: &#1081;&#1090;&#1077;
 4: reek: &Sigma;&#8050; &gamma;&nu;&omega;&rho;&#8055;&zeta;&omega; &#7936;&pi;&#8056; &tau;&#8052;&nu; &kappa;&#8057;&psi;&eta; Russian: &#1047;&#1072;&#1088;&#1077;&#1075;&#1080;&#1089;&#1090;&#1088;&#1080;&#1088;&#1091;&#1081;&#1090;&#1077;&#1089;&#1100;
 5:
+Testing agreement with mb_strpos on invalid UTF-8 string:
+?AAA
 Regression:
 1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842