Skip to content

Commit 81e236c

Browse files
committed
Fix infinite loop when mb_detect_encoding is used on UTF-8 BOM
This bug was introduced in cb84079. Thanks to Ignace Nyamagana Butera for discovering this bug and to Sebastian Bergmann for doing an initial investigation and opening a bug ticket.
1 parent 43064ca commit 81e236c

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

ext/mbstring/mbstring.c

+6
Original file line number | Diff line number | Diff line change
@@ -3068,6 +3068,12 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
30683068
uint32_t wchar_buf[128];
30693069
unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
30703070

3071+
for (size_t i = 0; i < length; i++) {
3072+
if (array[i].in_len == 0) {
3073+
finished++;
3074+
}
3075+
}
3076+
30713077
while ((strict || length > 1) && finished < length) {
30723078
/* Iterate in reverse order to avoid moving candidates that can be eliminated. */
30733079
for (size_t i = length - 1; i != (size_t)-1; i--) {

ext/mbstring/tests/mb_detect_encoding.phpt

+13
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,13 @@ print("Bad ASCII (strict): " . mb_detect_encoding("\xDD\x92", ['ASCII', 'UTF-8']
2525
print("Bad ASCII/UTF-8, with more errors for ASCII (non-strict): " . mb_detect_encoding("\xD6\x8A\x8A", ['ASCII', 'UTF-8'], false) . "\n");
2626
print("Bad ASCII/UTF-8, with more errors for ASCII (strict): " . var_export(mb_detect_encoding("\xD6\x8A\x8A", ['ASCII', 'UTF-8'], true), true) . "\n");
2727

28+
print("UTF-8 BOM (non-strict): " . mb_detect_encoding("\xEF\xBB\xBF", ["UTF-8", "ASCII"], false) . "\n");
29+
print("UTF-8 BOM (strict): " . mb_detect_encoding("\xEF\xBB\xBF", ["UTF-8", "ASCII"], true) . "\n");
30+
print("UTF-16BE BOM (non-strict): " . mb_detect_encoding("\xFE\xFF", ["UTF-8", "UTF-16BE", "UTF-16LE"], false) . "\n");
31+
print("UTF-16BE BOM (strict): " . mb_detect_encoding("\xFE\xFF", ["UTF-8", "UTF-16BE", "UTF-16LE"], true) . "\n");
32+
print("UTF-16LE BOM (non-strict): " . mb_detect_encoding("\xFF\xFE", ["UTF-8", "UTF-16BE", "UTF-16LE"], false) . "\n");
33+
print("UTF-16LE BOM (strict): " . mb_detect_encoding("\xFF\xFE", ["UTF-8", "UTF-16BE", "UTF-16LE"], true) . "\n");
34+
2835
print("SJIS: " . mb_detect_encoding($sjis, 'SJIS', true) . "\n");
2936
print("JIS: " . mb_detect_encoding($jis, 'JIS', true) . "\n");
3037
print("EUC-JP (strict): " . mb_detect_encoding($euc_jp, 'UTF-8,EUC-JP,JIS', true) . "\n");
@@ -399,6 +406,12 @@ Bad ASCII (non-strict): UTF-8
399406
Bad ASCII (strict): UTF-8
400407
Bad ASCII/UTF-8, with more errors for ASCII (non-strict): UTF-8
401408
Bad ASCII/UTF-8, with more errors for ASCII (strict): false
409+
UTF-8 BOM (non-strict): UTF-8
410+
UTF-8 BOM (strict): UTF-8
411+
UTF-16BE BOM (non-strict): UTF-16BE
412+
UTF-16BE BOM (strict): UTF-16BE
413+
UTF-16LE BOM (non-strict): UTF-16LE
414+
UTF-16LE BOM (strict): UTF-16LE
402415
SJIS: SJIS
403416
JIS: JIS
404417
EUC-JP (strict): EUC-JP

0 commit comments

Comments
 (0)