Skip to content

Commit 44e8301

Browse files
youkidearitaidevnexen
authored and committed
Add grapheme_str_split function
I noticed that PHP does not have a grapheme-cluster-based str_split function, so I created the grapheme_str_split function. This feature allows you to correctly handle emoji and variation selectors. Co-authored-by: Ayesh Karunaratne <Ayesh@users.noreply.github.com> Close GH-13580
1 parent 78ccea4 commit 44e8301

File tree

6 files changed

+91
-1
lines changed

6 files changed

+91
-1
lines changed

NEWS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ PHP NEWS
8383
- ValueError if the integer index does not fit in a signed 32 bit integer
8484
. ResourceBundle::get() now has a tentative return type of:
8585
ResourceBundle|array|string|int|null
86+
. Added the new Grapheme function grapheme_str_split. (youkidearitai)
8687

8788
- LDAP:
8889
. Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3

UPGRADING

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,8 @@ PHP 8.4 UPGRADE NOTES
435435
- Intl:
436436
. Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to get
437437
the IANA identifier from a given timezone.
438+
. Added grapheme_str_split, which correctly handles emoji and Variation
439+
Selectors.
438440

439441
- MBString:
440442
. Added mb_trim, mb_ltrim and mb_rtrim functions.

ext/intl/grapheme/grapheme_string.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -816,4 +816,82 @@ PHP_FUNCTION(grapheme_extract)
816816
RETURN_STRINGL(((char *)pstr), ret_pos);
817817
}
818818

819+
PHP_FUNCTION(grapheme_str_split)
820+
{
821+
char *pstr, *end;
822+
zend_string *str;
823+
zend_long split_len = 1;
824+
825+
unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
826+
UErrorCode ustatus = U_ZERO_ERROR;
827+
int32_t pos, current, i, end_len = 0;
828+
UBreakIterator* bi;
829+
UText *ut = NULL;
830+
831+
ZEND_PARSE_PARAMETERS_START(1, 2)
832+
Z_PARAM_STR(str)
833+
Z_PARAM_OPTIONAL
834+
Z_PARAM_LONG(split_len)
835+
ZEND_PARSE_PARAMETERS_END();
836+
837+
if (split_len <= 0 || split_len > UINT_MAX / 4) {
838+
zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
839+
RETURN_THROWS();
840+
}
841+
842+
if (ZSTR_LEN(str) == 0) {
843+
RETURN_EMPTY_ARRAY();
844+
}
845+
846+
pstr = ZSTR_VAL(str);
847+
ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
848+
849+
if ( U_FAILURE( ustatus ) ) {
850+
/* Set global error code. */
851+
intl_error_set_code( NULL, ustatus );
852+
853+
/* Set error messages. */
854+
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
855+
856+
RETURN_FALSE;
857+
}
858+
859+
bi = NULL;
860+
ustatus = U_ZERO_ERROR;
861+
bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
862+
863+
if( U_FAILURE(ustatus) ) {
864+
RETURN_FALSE;
865+
}
866+
867+
ubrk_setUText(bi, ut, &ustatus);
868+
869+
pos = 0;
870+
array_init(return_value);
871+
872+
for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
873+
end_len = pos - current;
874+
pos = ubrk_next(bi);
875+
876+
if (i == split_len - 1) {
877+
if ( pos != UBRK_DONE ) {
878+
add_next_index_stringl(return_value, pstr, pos - current);
879+
end = pstr + pos - current;
880+
i = 0;
881+
}
882+
pstr += pos - current;
883+
current = pos;
884+
} else {
885+
i += 1;
886+
}
887+
}
888+
889+
if (i != 0 && end_len != 0) {
890+
add_next_index_stringl(return_value, end, end_len);
891+
}
892+
893+
utext_close(ut);
894+
ubrk_close(bi);
895+
}
896+
819897
/* }}} */

ext/intl/php_intl.stub.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,8 @@ function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle =
445445

446446
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
447447

448+
function grapheme_str_split(string $string, int $length = 1): array|false {}
449+
448450
/** @param int $next */
449451
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
450452

ext/intl/php_intl_arginfo.h

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
1.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)