Fix infinite loop in mb_encode_mimeheader

alexdowad · ramsey · commit 3394efc63e52 · 2024-04-09T23:52:11.000-05:00
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -5858,6 +5858,9 @@ static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encodin
 	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
 	size_t in_len = ZSTR_LEN(input);
 
+	ZEND_ASSERT(outcode->mime_name != NULL);
+	ZEND_ASSERT(outcode->mime_name[0] != '\0');
+
 	if (!in_len) {
 		return zend_empty_string;
 	}
@@ -5880,7 +5883,8 @@ static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encodin
 	unsigned int state = 0;
 	/* wchar_buf should be big enough that when it is full, we definitely have enough
 	 * wchars to fill an entire line of output */
-	uint32_t wchar_buf[80];
+	const size_t wchar_buf_len = 90;
+	uint32_t wchar_buf[wchar_buf_len];
 	uint32_t *p, *e;
 	/* What part of wchar_buf is filled with still-unprocessed data which should not
 	 * be overwritten? */
@@ -5891,7 +5895,7 @@ static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encodin
 	 * spaces), just pass it through unchanged */
 	bool checking_leading_spaces = true;
 	while (in_len) {
-		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, 80, &state);
+		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, wchar_buf_len, &state);
 		p = wchar_buf;
 		e = wchar_buf + out_len;
 
@@ -5925,9 +5929,9 @@ no_passthrough: ;
 	 * do so all the way to the end of the string */
 	while (in_len) {
 		/* Decode part of the input string, refill wchar_buf */
-		ZEND_ASSERT(offset < 80);
-		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
-		ZEND_ASSERT(out_len <= 80 - offset);
+		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= wchar_buf_len);
+		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, wchar_buf_len - offset, &state);
+		ZEND_ASSERT(out_len <= wchar_buf_len - offset);
 		p = wchar_buf;
 		e = wchar_buf + offset + out_len;
 		/* ASCII output is broken into space-delimited 'words'
@@ -5948,6 +5952,7 @@ no_passthrough: ;
 				 * If we are already too far along on a line to include Base64/QPrint encoded data
 				 * on the same line (without overrunning max line length), then add a line feed
 				 * right now */
+feed_and_mime_encode:
 				if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
 					MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
 					buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
@@ -5985,7 +5990,13 @@ no_passthrough: ;
 
 		if (in_len) {
 			/* Copy chars which are part of an incomplete 'word' to the beginning
-			 * of wchar_buf and reprocess them on the next iteration */
+			 * of wchar_buf and reprocess them on the next iteration.
+			 * But first make sure that the incomplete 'word' isn't so big that
+			 * there will be no space to add any more decoded wchars in the buffer
+			 * (which could lead to an infinite loop) */
+			if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
+				goto feed_and_mime_encode;
+			}
 			offset = e - word_start;
 			if (offset) {
 				memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
@@ -6027,17 +6038,17 @@ mime_encoding_needed: ;
 
 	/* Do we need to refill wchar_buf to make sure we don't run out of wchars
 	 * in the middle of a line? */
-	if (p == wchar_buf) {
+	offset = e - p;
+	if (wchar_buf_len - offset < MBSTRING_MIN_WCHAR_BUFSIZE) {
 		goto start_new_line;
 	}
-	offset = e - p;
 	memmove(wchar_buf, p, offset * sizeof(uint32_t));
 
 	while(true) {
 refill_wchar_buf: ;
-		ZEND_ASSERT(offset < 80);
-		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
-		ZEND_ASSERT(out_len <= 80 - offset);
+		ZEND_ASSERT(offset + MBSTRING_MIN_WCHAR_BUFSIZE <= wchar_buf_len);
+		size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, wchar_buf_len - offset, &state);
+		ZEND_ASSERT(out_len <= wchar_buf_len - offset);
 		p = wchar_buf;
 		e = wchar_buf + offset + out_len;
 
@@ -6112,22 +6123,18 @@ start_new_line: ;
 
 					indent = 0; /* Indent argument must only affect the first line */
 
-					if (in_len) {
-						/* We still have more of input string remaining to decode */
+					if (in_len || p < e) {
+						/* We still have more input to process */
 						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
 						buf.out = mb_convert_buf_add(buf.out, ' ');
 						line_start = mb_convert_buf_len(&buf);
-						/* Copy remaining wchars to beginning of buffer so they will be
-						 * processed on the next iteration of outer 'do' loop */
 						offset = e - p;
-						memmove(wchar_buf, p, offset * sizeof(uint32_t));
-						goto refill_wchar_buf;
-					} else if (p < e) {
-						/* Input string is finished, but we still have trailing wchars
-						 * remaining to be processed in wchar_buf */
-						buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
-						buf.out = mb_convert_buf_add(buf.out, ' ');
-						line_start = mb_convert_buf_len(&buf);
+						if (in_len && (wchar_buf_len - offset >= MBSTRING_MIN_WCHAR_BUFSIZE)) {
+							/* Copy any remaining wchars to beginning of buffer and refill
+							 * the rest of the buffer */
+							memmove(wchar_buf, p, offset * sizeof(uint32_t));
+							goto refill_wchar_buf;
+						}
 						goto start_new_line;
 					} else {
 						/* We are done! */
@@ -6165,7 +6172,7 @@ PHP_FUNCTION(mb_encode_mimeheader)
 		charset = php_mb_get_encoding(charset_name, 2);
 		if (!charset) {
 			RETURN_THROWS();
-		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
+		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
 			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
 			RETURN_THROWS();
 		}
diff --git a/ext/mbstring/tests/mb_encode_mimeheader_basic4.phpt b/ext/mbstring/tests/mb_encode_mimeheader_basic4.phpt
@@ -115,11 +115,29 @@ var_dump(mb_encode_mimeheader("\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
 // In the general case, matching the old implementation's decision to transfer-encode or not
 // perfectly would require allocating potentially unbounded scratch memory (up to the size of
 // the input string), but we aim to only use a constant amount of temporarily allocated memory
-var_dump(mb_encode_mimeheader("2\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20!3", "GB18030", "Q", ""));
+var_dump(mb_encode_mimeheader("2\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20!3", "GB18030", "Q", ""));
+
+// Regression test for infinite loop which was unintentionally caused when refactoring
+var_dump(mb_encode_mimeheader(",9868949,9868978,9869015,9689100,9869121,9869615,9870690,9867116,98558119861183. ", "utf-8", "B"));
+var_dump(mb_encode_mimeheader('xx ' . str_repeat("A", 81) . " ", "utf-8", "B"));
+
+// Regression test for problem where MIME encoding loop would not leave enough space in wchar
+// buffer for the next iteration, causing an assertion failure
+mb_internal_encoding('MacJapanese');
+var_dump(mb_encode_mimeheader("ne\xf6\xff\xff\xffs\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff1\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff1", 'CP50220', 'B', "A", 44));
+
+// Regression test for failing assertion caused by the fact that QPrint deliberately generates no
+// wchars for CR (0x0D) bytes
+try {
+	mb_internal_encoding('Quoted-Printable');
+	var_dump(mb_encode_mimeheader("=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=00=00=00=00=00=00=00=01=00=00=00=00=00=00=00850r=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=0D=00=00=00=0050r=08=0DCP850r850r0r", "Quoted-Printable", "B", "", 184));
+} catch (\ValueError $e) {
+	echo $e->getMessage() . \PHP_EOL;
+}
 
 echo "Done";
 ?>
---EXPECT--
+--EXPECTF--
 string(0) ""
 string(21) "=?UTF-8?Q?abc=00abc?="
 string(16) "=?UTF-8?B?Pw==?="
@@ -156,5 +174,14 @@ string(75) "  111111111111111111111111111111111111111111111111111111111111111111
 string(33) "=?HZ-GB-2312?Q?=7E=7Bs=5B=7E=7D?="
 string(77) "2                                                                          !3"
 string(282) "=?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20!=33=20?="
-string(296) "2 =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20!=33?="
+string(344) "2 =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20!=33?="
+string(135) "=?UTF-8?B?LDk4Njg5NDksOTg2ODk3OCw5ODY5MDE1LDk2ODkxMDAsOTg2OTEyMSw5ODY5?=
+ =?UTF-8?B?NjE1LDk4NzA2OTAsOTg2NzExNiw5ODU1ODExOTg2MTE4My4g?="
+string(142) "xx =?UTF-8?B?QUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFB?=
+ =?UTF-8?B?QUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBIA==?="
+string(690) "=?ISO-2022-JP?B?bmU/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/cxskQiFEGyhCPw==?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/MRskQiFEGyhCPxskQiFEGyhCPw==?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/GyRCIUQbKEI/?=A =?ISO-2022-JP?B?GyRCIUQbKEI/GyRCIUQbKEI/MQ==?="
+
+
+Deprecated: mb_encode_mimeheader(): Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead in %s on line %d
+mb_encode_mimeheader(): Argument #2 ($charset) "Quoted-Printable" cannot be used for MIME header encoding
 Done